In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score

from sklearn import tree

from matplotlib import pyplot as plt

import lime
import lime.lime_tabular

import plotly.graph_objects as go

In [2]:
def read_csv(filepath):
    """Load a CSV source into a pandas DataFrame.

    Parameters
    ----------
    filepath
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` produces with its default options.
    """
    # Open question left by the original author: sep = none?
    return pd.read_csv(filepath)

In [3]:
def split_to_2_predictors(dataset1, dataset2, features):
    """Select the predictor columns from two datasets.

    Parameters
    ----------
    dataset1, dataset2
        Objects indexable by ``features`` (used as ``dataset[features]``).
    features
        Column selector applied identically to both datasets.

    Returns
    -------
    tuple
        ``(dataset1[features], dataset2[features])``.
    """
    return tuple(dataset[features] for dataset in (dataset1, dataset2))

In [4]:
def split_to_2_targets(dataset1, dataset2, outcome):
    """Select the target column(s) from two datasets.

    Parameters
    ----------
    dataset1, dataset2
        Objects indexable by ``outcome`` (used as ``dataset[outcome]``).
    outcome
        Selector of the target, applied identically to both datasets.

    Returns
    -------
    tuple
        ``(dataset1[outcome], dataset2[outcome])``.
    """
    return tuple(dataset[outcome] for dataset in (dataset1, dataset2))

In [5]:
def train_test_split_data(dataset_X, dataset_y):
    """Split predictors and targets into train and test parts.

    The split is shuffled with a fixed ``random_state`` of 1 and stratified
    on ``dataset_y``; no split size is passed, so ``train_test_split``'s
    own defaults apply.

    Returns
    -------
    The value returned by ``train_test_split(dataset_X, dataset_y, ...)``,
    unchanged.
    """
    split_options = dict(random_state=1, shuffle=True, stratify=dataset_y)
    return train_test_split(dataset_X, dataset_y, **split_options)

In [6]:
def create_decision_tree(criterion="gini", splitter="best", max_depth=6):
    """Build an unfitted ``DecisionTreeClassifier``.

    Parameters
    ----------
    criterion, splitter, max_depth
        Forwarded to ``DecisionTreeClassifier`` under the same names.

    Returns
    -------
    DecisionTreeClassifier
        A new classifier created with ``random_state=1``.
    """
    return DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        random_state=1,
    )

In [7]:
def fit_decision_tree(train_X, train_y, model):
    """Fit ``model`` on the training data by calling ``model.fit``.

    Parameters
    ----------
    train_X, train_y
        Passed positionally to ``model.fit`` in that order.
    model
        Any object exposing ``fit(X, y)``.

    Returns
    -------
    None
        The value returned by ``model.fit`` is discarded.
    """
    fit = model.fit
    fit(train_X, train_y)

In [8]:
def test_decision_tree(model, testcases):
    return model.predict(testcases)

In [9]:
def calculate_accuracy(testcases, prediction_result):
    """Return the accuracy of a prediction as a percentage.

    Parameters
    ----------
    testcases
        First argument given to ``accuracy_score``.
    prediction_result
        Second argument given to ``accuracy_score``.

    Returns
    -------
    ``accuracy_score(testcases, prediction_result)`` multiplied by 100.
    """
    score = accuracy_score(testcases, prediction_result)
    return score * 100.00

In [10]:
def train_decision_tree_agreed_disagreed(all_cases, features):
    """Train a default decision tree that predicts the ``Agreed`` column.

    Parameters
    ----------
    all_cases
        Data providing the ``features`` columns and an ``Agreed`` attribute.
    features
        Column selector for the predictors.

    Returns
    -------
    The model built by ``create_decision_tree()`` after it has been fitted
    with ``fit_decision_tree``.
    """
    predictors = all_cases[features]
    agreement_flag = all_cases.Agreed
    classifier = create_decision_tree()
    fit_decision_tree(predictors, agreement_flag, classifier)
    return classifier

In [11]:
def visualise_decision_tree(model, features, class_names):
    """Plot a decision tree on a new 40x20 matplotlib figure.

    Parameters
    ----------
    model
        Tree passed to ``tree.plot_tree``.
    features
        Passed as ``feature_names``.
    class_names
        Passed as ``class_names``, e.g. ``['0', '1']``.

    Returns
    -------
    The figure created by ``plt.figure`` before plotting.
    """
    figure = plt.figure(figsize=(40, 20))
    # No axes are handed to plot_tree; presumably it draws onto the figure
    # created just above (the current figure).
    tree.plot_tree(model, feature_names=features, class_names=class_names, filled=True)
    return figure

In [12]:
def save_ML_model(file_name_for_save, model_to_save):
    """Pickle ``model_to_save`` to ``<file_name_for_save>.obj``.

    Parameters
    ----------
    file_name_for_save : str
        Path without extension; ``'.obj'`` is appended to it.
    model_to_save
        Any picklable object.

    Returns
    -------
    None
    """
    # The context manager closes (and therefore flushes) the file even if
    # pickling fails; the handle used to be left open.
    with open(file_name_for_save + '.obj', 'wb') as filehandler:
        pickle.dump(model_to_save, filehandler)

# For users:

In [13]:
def load_ML_model(file_to_open):
    """Load and return a pickled object from ``file_to_open``.

    Parameters
    ----------
    file_to_open : str
        Full path of the pickle file (including any extension).

    Returns
    -------
    The object reconstructed by ``pickle.load``.
    """
    # SECURITY: unpickling can execute arbitrary code; only open files that
    # come from a trusted source.
    # The context manager closes the file once loading has finished (or
    # failed); the handle used to be left open.
    with open(file_to_open, 'rb') as filehandler:
        return pickle.load(filehandler)

In [14]:
def calculate_similarity(prediction_result1, prediction_result2):
    """Return how closely two prediction results match, as a percentage.

    Parameters
    ----------
    prediction_result1, prediction_result2
        Passed to ``accuracy_score`` in that order.

    Returns
    -------
    ``accuracy_score(prediction_result1, prediction_result2)`` multiplied
    by 100.
    """
    matching_share = accuracy_score(prediction_result1, prediction_result2)
    return matching_share * 100.00

In [15]:
def number_of_disagreed_cases(testcases, similarity):
    """Estimate how many test cases the two predictions disagree on.

    Parameters
    ----------
    testcases
        Sized collection; only ``len(testcases)`` is used.
    similarity
        Agreement percentage (divided by 100 to obtain a fraction).

    Returns
    -------
    int
        ``len(testcases)`` times the disagreeing fraction, rounded with the
        built-in ``round``.
    """
    disagreeing_fraction = 1 - (similarity / 100)
    estimated_count = len(testcases) * disagreeing_fraction
    return round(estimated_count)

In [16]:
def return_disagreed_cases(testcases, prediction_result1, prediction_result2):
    """Return the test cases on which the two predictions differ.

    Parameters
    ----------
    testcases : pandas.DataFrame
        One row per prediction, in the same order as both prediction
        results.
    prediction_result1, prediction_result2 : array-like
        Predicted labels, compared position by position.

    Returns
    -------
    pandas.DataFrame
        A copy of the differing rows with an extra ``'Agreed'`` column set
        to 0. ``testcases`` itself is not modified.

    Raises
    ------
    ValueError
        If the two prediction results do not have the same shape.
    """
    first = np.asarray(prediction_result1)
    second = np.asarray(prediction_result2)
    if first.shape != second.shape:
        raise ValueError(
            "prediction results must have the same shape, got "
            f"{first.shape} and {second.shape}"
        )

    # Compare labels for inequality. A logical XOR only compares truthiness,
    # so two different non-zero labels (e.g. 1 vs 2) would be treated as
    # agreement and non-numeric labels would not work at all.
    differs = first != second

    # Work on an explicit copy so that adding the flag column neither touches
    # the caller's frame nor triggers pandas' SettingWithCopyWarning.
    disagreed = testcases[differs].copy()
    disagreed['Agreed'] = 0
    return disagreed

In [17]:
def return_agreed_cases(testcases, prediction_result1, prediction_result2):
    """Return the test cases that are not reported as disagreed.

    The disagreed rows are obtained from ``return_disagreed_cases``; every
    index label of ``testcases`` that is absent from that result is kept.

    Parameters
    ----------
    testcases
        Frame of test cases; rows are selected by index label.
    prediction_result1, prediction_result2
        Forwarded to ``return_disagreed_cases``.

    Returns
    -------
    The selected rows with an extra ``'Agreed'`` column set to 1, in the
    order produced by ``Index.difference``.
    """
    disagreed = return_disagreed_cases(testcases, prediction_result1, prediction_result2)
    agreed_labels = testcases.index.difference(disagreed.index)
    agreed = testcases.loc[agreed_labels]
    agreed['Agreed'] = 1
    return agreed

In [18]:
def disagreed_LIME(train1, train2, all_disagreed_testcases, model1, model2, features, classes):
    """Explain one randomly chosen disagreed case with LIME for both models.

    A ``LimeTabularExplainer`` is built per training set. The same randomly
    picked row of ``all_disagreed_testcases[features]`` is explained against
    ``model1.predict_proba`` and ``model2.predict_proba``, and both
    explanations are displayed with ``show_in_notebook``.

    Parameters
    ----------
    train1, train2
        Training data of model 1 / model 2; converted with ``to_numpy()``.
    all_disagreed_testcases
        Frame from which the case is drawn (restricted to ``features``).
    model1, model2
        Models exposing ``predict_proba`` and ``predict``.
    features
        Feature names; also determines ``num_features``.
    classes
        Class names handed to LIME and indexed by each model's prediction,
        so predictions are presumably integer positions into this sequence
        (confirm with the caller).

    Returns
    -------
    tuple
        ``(exp1_map[1], exp2_map[1], disagreed_case, exp1_result,
        exp2_result)``: the entry keyed ``1`` of each explanation's
        ``as_map()``, the explained row, and the class name each model
        predicts for it.
    """
    def build_explainer(training_matrix):
        # Both explainers share every option except their training data.
        return lime.lime_tabular.LimeTabularExplainer(training_matrix,
                                                      feature_names=features,
                                                      class_names=classes,
                                                      discretize_continuous=True)

    background1 = train1.to_numpy()
    background2 = train2.to_numpy()
    candidates = all_disagreed_testcases[features].to_numpy()
    feature_count = len(features)

    # The order of the steps below is kept as-is on purpose: several of them
    # may draw from NumPy's global random state.
    explainer1 = build_explainer(background1)

    picked_row = np.random.randint(0, len(candidates))
    disagreed_case = candidates[picked_row]

    exp1 = explainer1.explain_instance(disagreed_case, model1.predict_proba, num_features=feature_count)
    exp1_map = exp1.as_map()
    exp1.show_in_notebook()

    explainer2 = build_explainer(background2)

    exp2 = explainer2.explain_instance(disagreed_case, model2.predict_proba, num_features=feature_count)
    exp2_map = exp2.as_map()
    exp2.show_in_notebook()

    # predict() is given the single case as a one-row 2-D array.
    case_as_row = disagreed_case.reshape(1, -1)
    exp1_result = classes[model1.predict(case_as_row)[0]]
    exp2_result = classes[model2.predict(case_as_row)[0]]

    return exp1_map[1], exp2_map[1], disagreed_case, exp1_result, exp2_result

In [19]:
#     return feature_importance_output.plot.barh()
def print_feature_importance(fittedmodel, features):
    """Print and plot the feature importances of a fitted model.

    Parameters
    ----------
    fittedmodel
        Object exposing a ``feature_importances_`` attribute.
    features
        Labels used as the index of the importance table, one per
        importance value.

    Returns
    -------
    None
        The table (a single column named ``'Output'``) is printed and drawn
        as a horizontal bar chart.
    """
    # Name the column at construction time. The former
    # ``set_axis(..., inplace=True)`` call raises TypeError on pandas >= 2.0,
    # where the ``inplace`` argument no longer exists.
    feature_importance_output = pd.DataFrame(
        fittedmodel.feature_importances_,
        index=features,
        columns=['Output'],
    )
    print(feature_importance_output)
    feature_importance_output.plot.barh()

In [None]:
def feature_importance_for_disagreement(features, agreed_cases, disagreed_cases, model = None):
    """Show which features drive agreement versus disagreement.

    Parameters
    ----------
    features
        Column selector for the predictors.
    agreed_cases, disagreed_cases : pandas.DataFrame
        Cases carrying the ``features`` columns and an ``'Agreed'`` column.
        Only used when no ``model`` is supplied.
    model : optional
        Already fitted model. When omitted, a default decision tree is
        created and fitted on the stacked agreed and disagreed cases to
        predict ``'Agreed'``.

    Returns
    -------
    None
        The importances are reported through ``print_feature_importance``.
    """
    # Identity check: ``model == None`` would invoke a custom ``__eq__``.
    if model is None:
        # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat``
        # stacks the two frames the same way.
        all_cases = pd.concat([agreed_cases, disagreed_cases])
        X_train = all_cases[features]
        y_train = all_cases['Agreed']
        model = create_decision_tree()
        fit_decision_tree(X_train, y_train, model)
    print_feature_importance(model, features)

labels = [class1_v1, class2_v1, …classk_v1,
          class1, class2, …, classk, 
               class1_v2, class2_v2, …, classk_v2] // 3k labels <br>

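Source = [0, 0, …, 0, 1, 1, …, 1, …, k-1, …, k-1 {k blocks of k length, one per class predicted by v1},
	    k, k, …, k, k+1, k+1, …, k+1, …, 2k-1, …, 2k-1 {k blocks of k length, one per ground truth class} ] // k*k*2 <br>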
        
Target = [k, k+1, …, 2k-1 {each of the ground truth classes}, k, k+1, …, 2k-1, … {k blocks of k length},
                2k, 2k+1, …, 3k-1, 2k, 2k+1, …, 3k-1, … {k blocks of k length} ] // k*k*2 <br>
                
Value = [num(predicted class1 by v1 that are of class1), 
              num(predicted class1 by v1 that are of class2), …,
              num(predicted class1 by v1 that are of classk),
	…,
              num(class 1 where v2 predicted class1), {source k to target 2k}
              num(class 1 where v2 predicted class2), {source k to target 2k+1}…,
              num(class 1 where v2 predicted classk),
              … ] // k*k*2

labels = ['Younger Diabetes', 'Younger No Diabetes', 'Diabetes', 'No Diabetes', 'Older Diabetes', 'Older No Diabetes']

sources = [0, 0, 1, 1, 4, 4, 5, 5] 

targets = [2, 3, 2, 3, 2, 3, 2, 3]

- #num(predicted class1 by v1 that are of class1), 
- #num(predicted class1 by v1 that are of class2), …,

values = [younger_y_pred_confusion[1][1], younger_y_pred_confusion[1][0],
          younger_y_pred_confusion[0][1], younger_y_pred_confusion[0][0],
          older_y_pred_opposite_confusion[1][1], older_y_pred_opposite_confusion[1][0],
          older_y_pred_opposite_confusion[0][1], older_y_pred_opposite_confusion[0][0],]

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = labels,
      color = "blue"
    ),
    link = dict(
      source = sources,
      target = targets,
      value = values
  ))])

fig.update_layout(title_text="Sankey Diagram", font_size=10)
fig.show()

In [20]:
def sankey_diagram(confusion_matrix1, confusion_matrix2, classes):
    """Show a three-column Sankey diagram built from two confusion matrices.

    With ``k = len(classes)`` the diagram has ``3 * k`` nodes labelled
    ``classes * 3``. Nodes ``0 .. k-1`` link to nodes ``k .. 2k-1`` with
    counts from ``confusion_matrix1``, and nodes ``k .. 2k-1`` link to nodes
    ``2k .. 3k-1`` with counts from ``confusion_matrix2``. That gives
    ``2 * k * k`` links in total.

    Parameters
    ----------
    confusion_matrix1, confusion_matrix2
        ``k`` x ``k`` matrices indexable as ``matrix[row][column]``.
    classes : list
        Class labels; repeated three times to label the node columns.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Notes
    -----
    The link from source slot ``s`` to target slot ``t`` takes its value from
    ``matrix[k-1-t][k-1-s]``. For two classes this reproduces the original
    hard-coded order ``[1][1], [0][1], [1][0], [0][0]``.
    NOTE(review): this mirrored indexing implies that ``classes`` is listed
    in the reverse of the confusion-matrix label order; confirm with the
    callers before relying on it for more than two classes.
    """
    number_of_classes = len(classes)
    class_slots = range(number_of_classes)

    labels = classes * 3

    # Every node of the first two columns is the source of k links. The loop
    # used to run over k*k nodes, which matches 2*k only when k == 2.
    sources = [node
               for node in range(2 * number_of_classes)
               for _ in class_slots]

    # Sources of the first column target the middle column, sources of the
    # middle column target the last column.
    targets = [column_start + slot
               for column_start in (number_of_classes, 2 * number_of_classes)
               for _ in class_slots
               for slot in class_slots]

    # One value per (source, target) pair, in the same order as the two
    # lists above.
    last = number_of_classes - 1
    values = [matrix[last - target_slot][last - source_slot]
              for matrix in (confusion_matrix1, confusion_matrix2)
              for source_slot in class_slots
              for target_slot in class_slots]

    fig = go.Figure(data=[go.Sankey(
        node = dict(
            pad = 15,
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = labels,
            color = "blue"
        ),
        link = dict(
            source = sources,
            target = targets,
            value = values
        ))])

    fig.update_layout(title_text="Sankey Diagram", font_size=10)
    fig.show()