<a href="https://colab.research.google.com/github/CALDISS-AAU/sdsphd19_coursematerials/blob/master/notebooks/SDS_PHD_Explainable_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Standard stuff
import pandas as pd #for manipulating data
import numpy as np #for manipulating data
import sklearn #for building models

# Dataviz
import matplotlib.pyplot as plt #for custom graphs at the end
import seaborn as sns #for custom graphs at the end

# Other tooling
import os #needed to use Environment Variables in Domino
import time #some of the routines take a while so we monitor the time

# SML
import xgboost as xgb #for building models
import sklearn.ensemble #for building models
from sklearn.model_selection import train_test_split #for creating a hold-out sample
from sklearn import datasets #built-in datasets (the wine data is loaded from here)

# Explainable ML&AI tools
!pip install lime
import lime #LIME package
import lime.lime_tabular #the type of LIME analysis we'll do (tabular data)
!pip install shap
import shap #SHAP package
import yellowbrick as yb
!pip install pdpbox
from pdpbox import pdp

# Model

In [0]:
from sklearn import datasets

# Load the wine dataset bundled with scikit-learn and assemble one DataFrame:
# a column per feature, plus the class label in `target`.
wine_data = datasets.load_wine()
wine_features = pd.DataFrame(data=wine_data.data, columns=wine_data.feature_names)
df_wine = wine_features.assign(target=pd.Series(wine_data.target))

In [0]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the class label.
X = df_wine.drop(columns=['target'])
y = df_wine['target']

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so that the model fit and every explanation
# built on top of this split are identical after a Restart & Run All.
# stratify=y keeps the class proportions approximately equal in the train and
# test partitions, which matters for a small multi-class dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [0]:
from yellowbrick.features import Rank2D
import matplotlib.pyplot as plt

# Pairwise Pearson correlation between the training features.
visualizer = Rank2D(algorithm="pearson", size=(1080, 720))
visualizer.fit_transform(X_train)
# show() is the current name of the render call; poof() was deprecated in
# Yellowbrick 1.0. The trailing semicolon hides the returned object's repr.
visualizer.show();

In [0]:
from yellowbrick.classifier import ClassificationReport
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted model, and therefore the ELI5 and LIME
# explanations that reuse `model` further down, are reproducible across runs.
model = RandomForestClassifier(random_state=42)

# Fit on the training split, score on the hold-out split, and draw the
# per-class classification report.
visualizer = ClassificationReport(model, size=(1080, 720))
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
# show() is the current name of the render call; poof() was deprecated in
# Yellowbrick 1.0. The trailing semicolon hides the returned object's repr.
visualizer.show();

# ELI5

In [0]:
import eli5

# Global explanation: the feature weights ELI5 reports for the fitted model,
# labelled with the original column names.
eli5.show_weights(model, feature_names=list(X.columns))

In [0]:
from eli5 import show_prediction

# Local explanation: break down the model's prediction for a single training
# row (positional index 1), displaying the feature values next to the weights.
row_to_explain = X_train.iloc[1]
show_prediction(
    model,
    row_to_explain,
    feature_names=list(X.columns),
    show_feature_values=True,
)

# LIME

In [0]:
import lime.lime_tabular

# LIME matches class_names positionally to the columns returned by the
# prediction function. scikit-learn orders predict_proba columns by sorted
# class label, whereas Series.unique() returns labels in order of first
# appearance in the (shuffled) training set. The names are therefore sorted
# here so that each probability column gets the right label.
# random_state makes LIME's perturbation sampling repeatable between runs.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=X_train.columns.tolist(),
    class_names=sorted(y_train.unique()),
    random_state=42,
)

In [0]:
def predict_fn(x):
    """Return the model's class probabilities for `x`, cast to float."""
    return model.predict_proba(x).astype(float)

In [0]:
# Explain the prediction for the test row at position 1, using at most six
# features in the explanation, then render it inline.
instance = X_test.values[1]
exp = explainer.explain_instance(instance, predict_fn, num_features=6)
exp.show_in_notebook(show_all=False)

# MLxtend

In [0]:
!pip install mlxtend

In [0]:
from mlxtend.plotting import plot_decision_regions
from mlxtend.classifier import EnsembleVoteClassifier

import matplotlib.gridspec as gridspec
import itertools 
from sklearn import model_selection

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [0]:
# Keep only two features, as plain NumPy arrays, so the decision regions can
# be drawn in a 2-D plane.
feature_pair = ['proline', 'color_intensity']
X_train_ml = X_train[feature_pair].values
y_train_ml = y_train.values

In [0]:
# Three base classifiers plus an equally weighted voting ensemble of them.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], weights=[1, 1, 1])

# NOTE(review): `value` and `width` are not read anywhere in this cell; they
# are kept only in case a later cell depends on them.
value = 1.5
width = 0.75

# 2x2 grid: one panel per classifier.
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10, 8))
labels = ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'Ensemble']

for clf, lab, grd in zip([clf1, clf2, clf3, eclf],
                         labels,
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X_train_ml, y_train_ml)
    # Create the panel on `fig` and pass it explicitly, so drawing does not
    # rely on pyplot's "current axes" state and `fig` keeps pointing at the
    # Figure instead of being overwritten by the plotting call's return value.
    ax = fig.add_subplot(gs[grd[0], grd[1]])
    plot_decision_regions(X=X_train_ml, y=y_train_ml, clf=clf, ax=ax)
    ax.set_title(lab)
    # Column order of X_train_ml: proline first, color_intensity second.
    ax.set_xlabel('proline')
    ax.set_ylabel('color_intensity')

fig.tight_layout()
plt.show()