# XAI Analytics Notebook

In [None]:
import os
import xai
import logging as log
import warnings
import matplotlib.pyplot as plt

from shap import initjs
from ipywidgets import Button, GridBox, Layout, ButtonStyle, Label, Dropdown, Text, Output, IntSlider, Checkbox
from util.commons import *
from util.ui import *
from util.model import *
from util.split import *
from util.dataset import *
from IPython.display import clear_output, display, HTML

# Initialise SHAP's JavaScript support so its plots can be shown in the notebook.
initjs()
# Render matplotlib figures inline in the cell output.
%matplotlib inline
plt.style.use('ggplot')
# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

## Data

Data can be any unprocessed fact, value, text, sound or picture that has not yet been interpreted or analyzed.

### Dataset Selection

In this step one could either choose one of the predefined datasets or use an external dataset.

In [None]:
# Widgets for picking one of the predefined datasets.
dataset_select_label = Label(
    layout=Layout(width='auto', height='auto'),
    value='Select a dataset (you can specify custom one by selecting other):',
)
dataset_select_dropdown = Dropdown(
    options=[member.name for member in Datasets],
    value=None,
    layout=Layout(width='200px', height='auto'),
)

# Widgets for a custom dataset. They start disabled and are enabled by the
# dropdown handler when 'other' is selected.
name_url_text_label = Label(value='Provide dataset name and URL:')
name_text = Text(
    description='Name: ',
    placeholder='e.g. Car Evaluation Data Set',
    disabled=True,
)
url_text = Text(
    description='URL: ',
    placeholder='e.g. https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data',
    disabled=True,
)
dataset_select_button = Button(
    description='Download dataset',
    layout=Layout(width='300px', height='auto'),
    style=ButtonStyle(button_color='green'),
    tooltip='Click me',
    icon='download',
    disabled=True,
)

# Status messages and the loaded DataFrame are rendered here.
dataset_select_output = Output()

display(
    dataset_select_label,
    dataset_select_dropdown,
    name_url_text_label,
    name_text,
    url_text,
    dataset_select_button,
    dataset_select_output,
)

def on_value_change_dataset_select_dropdown(change):
    """React to a new selection in the dataset dropdown.

    Selecting 'other' enables the name/URL fields and the download button.
    Any other selection disables them, loads that dataset via ``get_dataset``
    into the module-level ``dataset`` and shows the returned message together
    with the DataFrame.

    Args:
        change: Change dict delivered by the widget; only ``change['new']``
            is read.
    """
    global dataset
    dataset_select_output.clear_output()
    selection = str(change['new'])
    wants_custom = selection == 'other'

    # The custom-dataset controls are usable only while 'other' is selected.
    for widget in (name_text, url_text, dataset_select_button):
        widget.disabled = not wants_custom
    if wants_custom:
        return

    dataset, status = get_dataset(selection)
    with dataset_select_output:
        display(status)
        display(dataset.df)
            
            
def on_click_dataset_select_button(self):
    """Load a custom dataset from the name and URL typed into the text fields.

    The result of ``get_dataset(name, url)`` is stored in the module-level
    ``dataset``; the returned message and the DataFrame are then displayed.

    Args:
        self: The clicked button (unused).
    """
    global dataset
    dataset_select_output.clear_output()
    custom_name, custom_url = str(name_text.value), str(url_text.value)
    dataset, status = get_dataset(custom_name, custom_url)
    with dataset_select_output:
        for item in (status, dataset.df):
            display(item)

# Register the handlers: the dropdown reacts to changes of its 'value' trait,
# the button reacts to clicks.
dataset_select_dropdown.observe(on_value_change_dataset_select_dropdown, names='value')
dataset_select_button.on_click(on_click_dataset_select_button)

### Data Preprocessing (Optional)

Data preprocessing is an integral step in the learning process, as the quality of the data and the useful information that can be derived from it directly affect the ability of our model to learn. Therefore, it can be helpful to preprocess our data before feeding it into our model.

In [None]:
from pandas.api.types import is_numeric_dtype, is_string_dtype

# Widgets for stripping the dataset down to rows matching a column value.
strip_column_select_label = Label(
    layout=Layout(width='auto', height='auto'),
    value='Strip a column from the dataset:',
)
strip_column_select_dropdown = Dropdown(
    options=list(dataset.df.columns),
    value=None,
    layout=Layout(width='220px', height='auto'),
)
strip_button = Button(
    disabled=False,
    style=ButtonStyle(button_color='yellow'),
    tooltip='Strips everything except the selected value.',
    icon='bolt',
    layout=Layout(width='max-content', height='auto'),
)
# Outer output holds the value controls; the inner one holds the preview.
strip_column_output = Output()
strip_column_output_inner = Output()

# Back up the initial dataset. A real copy is required: plain assignment only
# creates a second name for the same DataFrame, so any in-place change made
# through dataset.df would silently alter the "backup" as well.
df_backup = dataset.df.copy()

# Defaults for the cell: comparison operator for numeric columns and the
# not-yet-computed stripped preview.
eq_value = '='
df_stripped = None

def on_value_change_strip_column_select_dropdown(change):
    """Show value controls that match the dtype of the newly selected column.

    Numeric columns get an operator radio plus a value slider; string columns
    get a dropdown of the column's unique values. Columns of any other dtype
    show no controls. Both outputs are cleared and the strip button's
    description is reset first.

    Args:
        change: Change dict delivered by the widget; ``change['new']`` is the
            selected column name.
    """
    for out in (strip_column_output, strip_column_output_inner):
        out.clear_output()
    strip_button.description = ''

    column = dataset.df[str(change['new'])]
    if is_numeric_dtype(column):
        eq_radio = init_strip_eq_radio(on_value_change_eq_radio)
        min_val, max_val, step = calculate_slider_properties(column.unique())
        value_slider = init_strip_value_slider(on_value_change_value_slider, min_val, max_val, step)
        controls = (eq_radio, value_slider)
    elif is_string_dtype(column):
        value_select_dropdown = init_strip_value_select_dropdown(
            on_value_change_value_select_dropdown, list(column.unique()))
        controls = (value_select_dropdown,)
    else:
        return

    with strip_column_output:
        display(*controls, strip_button, strip_column_output_inner)
            
def on_value_change_value_select_dropdown(change):
    """Preview the dataset stripped to the chosen value of a string column.

    Updates the strip button's description, stores the preview in the
    module-level ``df_stripped`` and displays it in the inner output.

    Args:
        change: Change dict delivered by the widget; ``change['new']`` is the
            selected value.
    """
    global df_stripped
    strip_column_output_inner.clear_output()
    chosen_value = str(change['new'])
    with strip_column_output_inner:
        column_name = strip_column_select_dropdown.value
        strip_button.description = f"{column_name} '{chosen_value}'"
        df_stripped = get_stripped_df(dataset.df, column_name, chosen_value)
        display(df_stripped)

def on_value_change_value_slider(change):
    """Preview the dataset stripped by comparing a numeric column to the slider value.

    Uses the operator currently held in the module-level ``eq_value``. Updates
    the strip button's description, stores the preview in the module-level
    ``df_stripped`` and displays it in the inner output.

    Args:
        change: Change dict delivered by the widget; ``change['new']`` is the
            slider value.
    """
    global df_stripped
    strip_column_output_inner.clear_output()
    threshold = float(str(change['new']))
    with strip_column_output_inner:
        column_name = strip_column_select_dropdown.value
        strip_button.description = f"{column_name} {eq_value} '{threshold}'"
        df_stripped = get_stripped_df(dataset.df, column_name, threshold, eq_value)
        display(df_stripped)

def on_value_change_eq_radio(change):
    """Store the newly selected comparison operator in the module-level ``eq_value``.

    Args:
        change: Change dict delivered by the widget; only ``change['new']``
            is read.
    """
    global eq_value
    operator_choice = change['new']
    eq_value = str(operator_choice)
            
def on_click_strip_button(self):
    """Apply the previewed strip to the dataset and record it in the history label.

    If no value has been chosen for the currently selected column, nothing is
    applied and a hint is shown in the inner output instead.

    Args:
        self: The clicked button (unused).
    """
    # The button description is cleared on every column change and only set
    # once a value has been picked. An empty description, or a missing preview,
    # therefore means there is nothing valid to apply. Applying anyway would set
    # dataset.df to None or to a stale preview of a previously selected column.
    if df_stripped is None or not strip_button.description:
        strip_column_output_inner.clear_output()
        with strip_column_output_inner:
            display('Select a value before stripping.')
        return

    strip_column_output.clear_output()
    # reset_index returns a new frame, so the preview object is left untouched.
    dataset.df = df_stripped.reset_index(drop=True)
    stack_label.value = stack_label.value + strip_button.description + ', '
    with strip_column_output:
        display('{} selected successfully.'.format(strip_button.description), dataset.df)
    
def on_click_reset_button(self):
    """Restore the dataset to its backed-up initial state and reset the history label.

    Args:
        self: The clicked button (unused).
    """
    strip_column_output.clear_output()
    # Hand out a new frame and leave the backup object alone. Rebinding
    # dataset.df to df_backup and resetting its index in place would make every
    # later in-place edit of dataset.df modify the backup too.
    dataset.df = df_backup.reset_index(drop=True)
    stack_label.value = 'Stripped columns for the dataset: '
    with strip_column_output:
        display('Dataset restored to its initial state.', dataset.df)

# Build the reset row and keep a handle to its label; the strip and reset
# handlers write the history of applied strips into stack_label.value.
hbox = generate_reset_strip_hbox(on_click_reset_button)
stack_label = get_reset_strip_hbox_label(hbox)
    
# Register the handlers for the column dropdown and the strip button.
strip_column_select_dropdown.observe(on_value_change_strip_column_select_dropdown, names='value')
strip_button.on_click(on_click_strip_button)

display(hbox, strip_column_select_label, strip_column_select_dropdown, strip_column_output)

### Data Visualization (Optional)

Data visualization is the graphical representation of information and data. By using visual elements like charts, graphs, and maps, data visualization tools provide an accessible way to see and understand trends, outliers, and patterns in data.

In [None]:
# Multi-select listing every column; the list box shows at most 20 rows.
show_imbalance_selectmultiple = SelectMultiple(
    options=list(dataset.df.columns),
    rows=min(len(list(dataset.df.columns)), 20),
    layout=Layout(width='auto', height='auto'),
)
show_imbalance_button = Button(
    description='Show imbalances',
    layout=Layout(width='auto', height='auto'),
    button_style='info',
    tooltip='Click me',
    icon='cubes',
)

# Each button's label and icon must describe what its own click handler
# renders: the matrix handler calls xai.correlations(..., plot_type="matrix"),
# and the dendogram handler relies on the default plot type. The labels were
# previously swapped, so each button showed the other plot.
correlations_matrix_button = Button(
    description='Correlations as a matrix',
    tooltip='Click me',
    icon='th-large',
    layout=Layout(width='auto', height='auto'),
    disabled=False,
    style=ButtonStyle(button_color='orange'),
)
correlations_dendogram_button = Button(
    description='Correlations as a hierarchical dendogram',
    tooltip='Click me',
    icon='sitemap',
    layout=Layout(width='auto', height='auto'),
    disabled=False,
    style=ButtonStyle(button_color='darkseagreen'),
)

# Shared output area for all three visualizations.
show_imbalance_output = Output()

def on_click_show_imbalance_button(self):
    """Draw the xai imbalance plot for the columns selected in the multi-select.

    Args:
        self: The clicked button (unused).
    """
    show_imbalance_output.clear_output()
    selected_columns = show_imbalance_selectmultiple.value
    with show_imbalance_output:
        xai.imbalance_plot(dataset.df, *selected_columns)
        
def on_click_correlations_matrix_button(self):
    """Display the dataset's correlations using xai's "matrix" plot type.

    Args:
        self: The clicked button (unused).
    """
    show_imbalance_output.clear_output()
    with show_imbalance_output:
        correlation_view = xai.correlations(dataset.df, include_categorical=True, plot_type="matrix")
        display(correlation_view)
        
def on_click_correlations_dendogram_button(self):
    """Display the dataset's correlations using xai's default plot type.

    Args:
        self: The clicked button (unused).
    """
    show_imbalance_output.clear_output()
    with show_imbalance_output:
        correlation_view = xai.correlations(dataset.df, include_categorical=True)
        display(correlation_view)

# Register one click handler per visualization button.
show_imbalance_button.on_click(on_click_show_imbalance_button)
correlations_matrix_button.on_click(on_click_correlations_matrix_button)
correlations_dendogram_button.on_click(on_click_correlations_dendogram_button)

# Arrange the controls in a grid; note the dendogram button is passed before
# the matrix button.
grid_box = generate_analyze_grid(show_imbalance_selectmultiple,
                                 show_imbalance_button,
                                 correlations_dendogram_button,
                                 correlations_matrix_button)

display(grid_box, show_imbalance_output)

### Selecting a Target

Select a target for the models that later will be trained, interpreted and compared.
> Note:
 * Classification is the task of predicting a discrete class label.
 * Regression is the task of predicting a continuous quantity.


In [None]:
# Widgets for choosing the target column of the dataset.
target_dropdown = Dropdown(
    options=list(dataset.df.columns),
    value=None,
    disabled=False,
)
target_select_button = Button(
    description='Select target',
    disabled=False,
    button_style='success',
    tooltip='Click me',
    icon='mouse-pointer',
)
# Target previews and the split status message are rendered here.
target_output = Output()

display(
    target_dropdown,
    target_select_button,
    target_output,
)

def on_value_change_target_dropdown(change):
    """Preview the column that was just selected as a candidate target.

    Only the first element returned by ``show_target`` is displayed; the
    accompanying message is discarded.

    Args:
        change: Change dict delivered by the widget; ``change['new']`` is the
            selected column.
    """
    target_output.clear_output()
    candidate = change['new']
    preview, _ = show_target(dataset.df, candidate)
    with target_output:
        display(preview)

def on_click_target_select_button(self):
    """Split the dataset into features and target using the dropdown's current value.

    The results of ``split_feature_target`` are stored in the module-level
    ``df_X`` and ``df_y``; the returned message is displayed.

    Args:
        self: The clicked button (unused).
    """
    global df_X, df_y
    target_output.clear_output()
    chosen_target = target_dropdown.value
    df_X, df_y, status = split_feature_target(dataset.df, chosen_target)
    with target_output:
        display(status)

# Register the handlers: preview on dropdown change, split on button click.
target_dropdown.observe(on_value_change_target_dropdown, names='value')
target_select_button.on_click(on_click_target_select_button)

## Training

Train up to eight models by using different properties and algorithms.

> Note: Hover the mouse over the description of a property in order to get more information about it.

In [None]:
# Widgets for choosing how many models to configure (the slider allows 1 to 8).
models_label = Label(
    layout=Layout(width='auto', height='auto'),
    value='Choose the number of models to be used: ',
)
models_slider = IntSlider(
    value=1,
    min=1,
    max=8,
    step=1,
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
)
# The model grid and training messages are rendered here.
models_output = Output()

def draw_grid():
    """Build the model grid from the current module-level state and display it.

    Reads ``df_X``, ``number_of_models`` and ``models`` and passes the three
    grid callbacks by keyword to ``generate_model_grid``.
    """
    callbacks = {
        'on_click_feature_exclude_button': on_click_feature_exclude_button,
        'on_value_change_split_type_dropdown': on_value_change_split_type_dropdown,
        'on_click_model_train_button': on_click_model_train_button,
    }
    grid = generate_model_grid(df_X, number_of_models, models, **callbacks)
    display(grid)

def on_value_change_models_slider(change):
    """Rebuild the model list for the new slider value and redraw the grid.

    Updates the module-level ``number_of_models`` and ``models``.

    Args:
        change: Change dict delivered by the widget; ``change['new']`` is the
            requested number of models.
    """
    global number_of_models, models
    models_output.clear_output()
    number_of_models = change['new']
    models, _unused = fill_empty_models(df_X, df_y, number_of_models)
    with models_output:
        draw_grid()

# Redraw the grid whenever the slider's value changes, then show the controls.
models_slider.observe(on_value_change_models_slider, names='value')
display(models_label, models_slider, models_output)

def on_value_change_split_type_dropdown(change):
    """Forward a split-type change to the model that owns the dropdown.

    Args:
        change: Change dict delivered by the widget; ``change['owner']``
            identifies the dropdown and ``change['new']`` is the new value.
    """
    dropdown, split_type = change['owner'], change['new']
    owning_model = get_model_by_split_type_dd(models, dropdown)
    change_cross_columns_status(owning_model, split_type)

def on_click_feature_exclude_button(self):
    """Remove features from the model owning the clicked button and redraw the grid.

    The value returned by ``remove_model_features`` is not used.

    Args:
        self: The clicked button, used to look up its model.
    """
    models_output.clear_output()
    owning_model = get_model_by_remove_features_button(models, self)
    remove_model_features(owning_model)
    with models_output:
        draw_grid()

def on_click_model_train_button(self):
    """Run ``fill_model`` on the model owning the clicked button and show its message.

    Args:
        self: The clicked button, used to look up its model.
    """
    status = fill_model(get_model_by_train_model_button(models, self))
    with models_output:
        display(status)

# Initially show only one model. This runs inside the output context so the
# grid, and anything printed while building the models, lands in models_output.
with models_output:
    number_of_models = 1
    # Only the first element of the returned pair is kept.
    models, _ = fill_empty_models(df_X, df_y, number_of_models)

    draw_grid()

## Global Interpretations Methods

Global model interpretability is about understanding how the model makes decisions, based on a holistic view of its features and each of the learned components such as weights, other parameters, and structures. It helps to understand the distribution of your target outcome based on the features.

### Feature Importance
 
Feature importance is a generic term for the degree to which a predictive model relies on a particular feature. Generally, a feature’s importance is the increase in the model’s prediction error after permuting the feature’s values.

#### ELI5
[ELI5](https://github.com/TeamHG-Memex/eli5) is a Python package which helps to debug machine learning classifiers and explain their predictions. It provides support for a wide variety of frameworks and packages and also implements several algorithms for inspecting black-box models.
 * ELI5's feature importance uses the simplest algorithm for feature importance. It just gives an explanation of estimator parameters (weights) for a given model.

#### Skater
[Skater](https://github.com/oracle/Skater)'s goal is to demystify the inner workings of any type of predictive model in a way that is language and framework agnostic. It supports algorithms to enable interpretability of supervised learning problems. The interpretation algorithms currently supported are post-hoc in nature. This approach helps us to apply interpretability to machine learning systems depending on the analytical use cases. The library has embraced object-oriented and functional programming paradigms as deemed necessary to provide scalability and concurrency while keeping code brevity in mind.
 * Skater's feature importance implementation is based on an information-theoretic criterion. It measures the entropy in the change of predictions, given a perturbation of a given feature. This means that the more a model’s decision depends on a feature, the more a prediction will change by perturbing this feature.

#### SHAP
[SHAP](https://github.com/slundberg/shap) (SHapley Additive exPlanations) is a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions.
 * SHAP's feature importance uses a combination of feature contributions and game theory to come up with SHAP values. Then, it computes the global feature importance by taking the average of the SHAP value magnitudes across the dataset.

In [None]:
# Controls for generating feature-importance plots for the current models.
feature_importance_grid = generate_feature_importance_grid(models=models)
feature_importance_button = Button(
    description='Generate feature importance plot(s)',
    disabled=False,
    layout=Layout(width='auto', height='auto'),
    button_style='info',
    tooltip='Click me',
    icon='bullseye',
)
feature_importance_output = Output()
# When ticked, a textual explanation is printed after the plots.
explain_feature_importance_checkbox = Checkbox(
    value=False,
    description='Explain',
    disabled=False,
    indent=False,
)

display(
    feature_importance_grid,
    explain_feature_importance_checkbox,
    feature_importance_button,
    feature_importance_output,
)

def on_click_feature_importance_button(self):
    """Generate a feature-importance plot for every model selected in the grid.

    Reads the importance type and the model names from
    ``feature_importance_grid``, renders one plot per selected model into
    ``feature_importance_output`` and, if the 'Explain' checkbox is ticked,
    prints a textual explanation afterwards.

    Args:
        self: The clicked button (unused).
    """
    feature_importance_output.clear_output()
    type_value = get_child_value_by_description(feature_importance_grid, "Type", 0)
    models_names = get_child_value_by_description(feature_importance_grid, "Model(s)", 0)
    selected_models = get_models_by_names(models, models_names)
    explain = explain_feature_importance_checkbox.value
    for model in selected_models:
        with feature_importance_output:
            if type_value == 'SKATER':
                # SKATER plots get an enlarged figure size.
                plt.rcParams['figure.figsize'] = [14, 15]
            else:
                # IPython magic; presumably used to restore the inline backend
                # defaults after a SKATER run changed rcParams — TODO confirm.
                %matplotlib inline
            plot = generate_feature_importance_plot(FeatureImportanceType[type_value], model)
            # Only display when the helper returned something truthy.
            if plot:
                display(plot)
    with feature_importance_output:
        if explain:
            # NOTE(review): this passes the full `models` list, not
            # `selected_models` — confirm the explanation is meant to cover
            # models that were not selected.
            print(generate_feature_importance_explanation(FeatureImportanceType[type_value], models))

# Register the click handler for the feature-importance button.
feature_importance_button.on_click(on_click_feature_importance_button)

### Partial Dependence Plot

The partial dependence plot (short PDP or PD plot) shows the marginal effect one or two features have on the predicted outcome of a machine learning model. A partial dependence plot can show whether the relationship between the target and a feature is linear, monotonic or more complex. For example, when applied to a linear regression model, partial dependence plots always show a linear relationship.

#### PDPBox
[PDPBox](https://github.com/SauceCat/PDPbox)'s goal is to visualize the impact of certain features towards model prediction for any supervised learning algorithm.


In [None]:
# Outer controls: choose the PDP type and the model(s) to plot.
pdp_grid = generate_pdp_grid(models=models)
pdp_model_select_button = Button(
    description='Select model(s)',
    disabled=False,
    layout=Layout(width='auto', height='auto'),
    button_style='success',
    tooltip='Click me',
    icon='mouse-pointer',
)
pdp_output = Output()

# Inner controls, shown inside pdp_output once models have been selected.
# The feature-selection grid is created by the model-select handler.
pdp_feature_selection_grid = None
generate_pdp_plots_button = Button(
    description='Generate PDP plot(s)',
    disabled=False,
    layout=Layout(width='auto', height='auto'),
    button_style='info',
    tooltip='Click me',
    icon='bullseye',
)
generate_pdp_output = Output()

# Names of the models chosen in the outer grid; filled by the select handler.
models_names = []

display(
    pdp_grid,
    pdp_model_select_button,
    pdp_output,
)

def on_click_pdp_model_select_button(self):
    """Record the selected models and show their feature-selection controls.

    Updates the module-level ``models_names`` and
    ``pdp_feature_selection_grid``, then displays the grid, the plot button
    and the plot output inside ``pdp_output``.

    Args:
        self: The clicked button (unused).
    """
    global models_names, pdp_feature_selection_grid
    for out in (pdp_output, generate_pdp_output):
        out.clear_output()
    models_names = get_child_value_by_description(pdp_grid, "Model(s)", 0)
    chosen_models = get_models_by_names(models, models_names)
    pdp_feature_selection_grid = generate_pdp_feature_selection_grid(chosen_models)
    with pdp_output:
        display(pdp_feature_selection_grid, generate_pdp_plots_button, generate_pdp_output)

def on_click_generate_pdp_plots_button(self):
    """Generate PDP plots for every previously selected model.

    For each model whose name is in ``models_names``, the two feature values
    are read from ``pdp_feature_selection_grid`` (children at index 0 and 1
    under the "... <model name>" description) and passed to
    ``generate_pdp_plots``.

    Args:
        self: The clicked button (unused).
    """
    generate_pdp_output.clear_output()
    type_value = get_child_value_by_description(pdp_grid, "Type", 0)
    for model in models:
        if model.name not in models_names:
            continue
        grid_description = "... " + model.name
        first_feature = get_child_value_by_description(pdp_feature_selection_grid, grid_description, 0)
        second_feature = get_child_value_by_description(pdp_feature_selection_grid, grid_description, 1)
        with generate_pdp_output:
            generate_pdp_plots(PDPType[type_value], model, first_feature, second_feature)
        
# Register the click handlers for the model-select and plot buttons.
pdp_model_select_button.on_click(on_click_pdp_model_select_button)
generate_pdp_plots_button.on_click(on_click_generate_pdp_plots_button)

## Local Interpretation Methods

Local interpretation focuses on specifics of each individual and provides explanations that can lead to a better understanding of the feature contribution in smaller groups of individuals that are often overlooked by the global interpretation techniques.

#### SHAP
[SHAP](https://github.com/slundberg/shap) (SHapley Additive exPlanations) leverages the idea of Shapley values for model feature influence scoring. The technical definition of a Shapley value is the “average marginal contribution of a feature value over all possible coalitions.” In other words, Shapley values consider all possible predictions for an instance using all possible combinations of inputs. Because of this exhaustive approach, SHAP can guarantee properties like consistency and local accuracy. LIME, on the other hand, does not offer such guarantees.

#### LIME
[LIME](https://github.com/marcotcr/lime) (Local Interpretable Model-agnostic Explanations) builds sparse linear models around each prediction to explain how the black box model works in that local vicinity. While treating the model as a black box, we perturb the instance we want to explain and learn a sparse linear model around it, as an explanation. LIME has the advantage over *SHAP*, that it is a lot faster.

In [None]:
# Controls for generating local (per-example) interpretations.
local_interpretation_grid = generate_local_interpretation_grid(models=models)
generate_local_interpretation_button = Button(
    description='Generate a Local Interpretation(s)',
    disabled=False,
    layout=Layout(width='auto', height='auto'),
    button_style='info',
    tooltip='Click me',
    icon='bullseye',
)
generate_local_interpretation_output = Output()
# When ticked, a textual explanation is printed for every example.
explain_local_interpretation_checkbox = Checkbox(
    value=False,
    description='Explain',
    disabled=False,
    indent=False,
)

display(
    local_interpretation_grid,
    explain_local_interpretation_checkbox,
    generate_local_interpretation_button,
    generate_local_interpretation_output,
)

def on_click_generate_local_interpretation_button(self):
    """Explain individual test examples for every model selected in the grid.

    Reads the interpreter type, model names, example type and number of
    examples from ``local_interpretation_grid``. For each example of each
    selected model it prints the example information and renders the
    explanation (LIME via ``show_in_notebook``, SHAP via ``display``). If the
    'Explain' checkbox is ticked, it also prints a textual explanation.

    Args:
        self: The clicked button (unused).
    """
    generate_local_interpretation_output.clear_output()

    def grid_value(description):
        # Every setting is read from child index 0 of the grid.
        return get_child_value_by_description(local_interpretation_grid, description, 0)

    type_value = grid_value("Type")
    selected_models = get_models_by_names(models, grid_value("Model(s)"))
    examples_type_value = grid_value("Example(s) type:")
    number_of_examples_value = grid_value("Number of examples:")
    explain = explain_local_interpretation_checkbox.value

    for model in selected_models:
        for example in get_test_examples(model, ExampleType[examples_type_value], number_of_examples_value):
            with generate_local_interpretation_output:
                print(get_example_information(model, example))
                interpreter_type = LocalInterpreterType[type_value]
                explanation = explain_single_instance(interpreter_type, model, example)
                if interpreter_type is LocalInterpreterType.LIME:
                    explanation.show_in_notebook(show_table=True, show_all=True)
                elif interpreter_type is LocalInterpreterType.SHAP:
                    display(explanation)
                if explain:
                    print(generate_single_instance_explanation(interpreter_type, model, example))

# Register the click handler for the local-interpretation button.
generate_local_interpretation_button.on_click(on_click_generate_local_interpretation_button)