# Exploring LIME - Local Interpretable Model-Agnostic Explanations 

Install first

In [1]:
# One-time setup: uncomment the line below to install LIME into the environment.
# The recorded output of this cell shows lime 0.1.1.18 being installed; the line is
# presumably kept commented out so that a full re-run does not reinstall the package.
#!pip install lime

Collecting lime
  Downloading lime-0.1.1.18.tar.gz (191kB)
[K    100% |████████████████████████████████| 194kB 2.5MB/s 
Installing collected packages: lime
  Running setup.py install for lime ... [?25l- done
[?25hSuccessfully installed lime-0.1.1.18


## Get the data from DashDB

In [111]:
import getpass
import os

import ibmdbpy
from ibmdbpy import IdaDataBase, IdaDataFrame

# Connection settings for the DashDB instance.
# The password is deliberately NOT stored in the notebook: it is read from the
# DASHDB_PASSWORD environment variable, or prompted for interactively if unset.
# The remaining settings can be overridden through the environment as well and
# otherwise fall back to the values this notebook was originally written with.
credentials_1 = {
    'host': os.environ.get('DASHDB_HOST', 'awh-yp-small03.services.dal.bluemix.net'),
    'port': os.environ.get('DASHDB_PORT', '50000'),
    'user': os.environ.get('DASHDB_USER', 'dash110459'),
    'password': os.environ.get('DASHDB_PASSWORD') or getpass.getpass('DashDB password: '),
    'database': os.environ.get('DASHDB_DATABASE', 'BLUDB'),
}

# Build the DSN from the settings above so that 'port' and 'database' are
# actually honoured instead of being hard-coded a second time in the string.
dsn = (
    "DASHDB;Database={database};Hostname={host};Port={port};"
    "PROTOCOL=TCPIP;UID={user};PWD={password}"
).format(**credentials_1)

idadb = IdaDataBase(dsn=dsn)

# Remote handle on the DATA_TRAIN table; as_dataframe() pulls it into a local
# pandas DataFrame, which is what the rest of the notebook works with.
ida_train = IdaDataFrame(idadb, 'DATA_TRAIN')
data = ida_train.as_dataframe()
data.head()

Exception AttributeError: "Cursor instance has no attribute 'closed'" in <bound method Cursor.__del__ of <pypyodbc.Cursor instance at 0x7fd4833bf4d0>> ignored


Unnamed: 0,v01,v02,v03,v04,v05,v06,v07,v08,v09,v10,v11,v12,v13,v14,v15,target
0,-0.581274,0.299583,0.672571,0.879624,1.417787,12273.412584,11.57725,1,0,1,0,2,4,1,3,0
1,0.10236,0.94239,1.358432,1.289167,3.519207,12273.412584,8.978558,1,1,1,2,1,1,2,6,1
2,-0.131057,0.796321,1.218057,1.704464,0.582041,12273.412584,10.950337,1,1,1,2,0,3,3,0,1
3,-1.664699,0.972988,2.604471,1.165724,4.58262,12273.412584,12.14913,1,1,1,1,0,2,3,0,0
4,1.018567,0.796813,1.549261,5.730283,0.854525,12273.412584,8.379034,1,1,0,0,0,3,0,9,1


The data lives in a data frame; however, LIME can only handle it as a matrix. This is a bit inconvenient, so we keep both objects.

In [113]:
# Matrix view of the data frame for LIME; `data` itself is kept around because
# the column names are still needed later on.
data_mat = data.values

The labels from the target are extracted to be used later on in the explanation

In [114]:
import sklearn.preprocessing  # explicit: no earlier cell imports this submodule

# Split the full matrix into the label column (last) and the predictor columns.
# Both pieces are derived from the data frame rather than by overwriting
# `data_mat` with a slice of itself, so re-running this cell cannot strip off
# one more column each time.
full_mat = data.values
target_raw = full_mat[:, -1]
data_mat = full_mat[:, :-1]

# Encode the labels as 0..n_classes-1 and keep the original class values;
# they are passed to the explainer as class names later on.
target_encoder = sklearn.preprocessing.LabelEncoder()
target = target_encoder.fit_transform(target_raw)
class_names = target_encoder.classes_
class_names

array([ 0.,  1.])

Since the predictors are now held in a plain matrix rather than a data frame, the predictor names have to be kept in a separate list.

In [116]:
# Predictor names in column order: every column of the frame except the label.
predictor_columns = data.columns.drop('target')
feature_names = predictor_columns.tolist()
feature_names

['v01',
 'v02',
 'v03',
 'v04',
 'v05',
 'v06',
 'v07',
 'v08',
 'v09',
 'v10',
 'v11',
 'v12',
 'v13',
 'v14',
 'v15']

From the set of predictors, the categorical predictors need special treatment. The next code identifies the indices of the categorical features.

In [119]:
import numpy as np

# 0-based column positions of the categorical predictors inside `data_mat`.
# Positions 8..14 line up with the names v09..v15 in `feature_names`.
# NOTE(review): v08 (position 7) also shows only integer values in the head()
# preview above -- confirm whether it should be treated as categorical too.
FIRST_CATEGORICAL_IDX = 8
END_CATEGORICAL_IDX = 15  # exclusive upper bound
categorical_features = np.arange(FIRST_CATEGORICAL_IDX, END_CATEGORICAL_IDX)
categorical_features

array([ 8,  9, 10, 11, 12, 13, 14])

For each categorical feature, the categories are extracted to be used in the explanation later on.

In [120]:
import sklearn
import sklearn.preprocessing  # the submodule must be imported explicitly

# For every categorical column: integer-encode its values in place and remember
# the original category values, keyed by column position. The explainer later
# receives this mapping as `categorical_names`.
categorical_names = {}
for feature in categorical_features:
    feature_le = sklearn.preprocessing.LabelEncoder()
    # Write the encoded column back into `data_mat`. The original wrote to
    # `data_tr`, a name that is never defined in this notebook, which raises a
    # NameError on a fresh kernel.
    data_mat[:, feature] = feature_le.fit_transform(data_mat[:, feature])
    categorical_names[feature] = feature_le.classes_
categorical_names

{8: array([ 0.,  1.]),
 9: array([ 0.,  1.]),
 10: array([ 0.,  1.,  2.]),
 11: array([ 0.,  1.,  2.]),
 12: array([ 0.,  1.,  2.,  3.,  4.]),
 13: array([ 0.,  1.,  2.,  3.,  4.]),
 14: array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.])}

Since the ML algorithms do not handle categorical features directly, the categorical features are dummy coded (one-hot encoded).

In [121]:
# One-hot encoder restricted to the categorical column positions. It is fitted
# on the full predictor matrix so that it sees every category present in the data.
# NOTE(review): the `categorical_features` argument is version-sensitive --
# newer scikit-learn releases are understood to have dropped it in favour of
# ColumnTransformer; confirm against the installed version before upgrading.
encoder = sklearn.preprocessing.OneHotEncoder(
    categorical_features=categorical_features
)
encoder.fit(data_mat)

Creation of the training and test set

In [126]:
import sklearn.model_selection  # explicit: `import sklearn` alone does not load this submodule

# Seed NumPy's global RNG so the split below is reproducible; train_test_split
# is called without its own random_state and therefore draws from that RNG.
np.random.seed(1)
train, test, target_train, target_test = sklearn.model_selection.train_test_split(
    data_mat, target, train_size=0.80
)

# Sanity check: every row ended up in exactly one of the two sets.
assert len(train) + len(test) == len(data_mat)
assert len(target_train) + len(target_test) == len(target)

This is the step where the categorical features are actually transformed into dummies.

In [129]:
# Apply the fitted one-hot encoder to both partitions, train first, then test.
encoded_train, encoded_test = (
    encoder.transform(partition) for partition in (train, test)
)

Train an extreme gradient boosting (XGBoost) classifier and evaluate it on the test set.

In [135]:
import xgboost
from sklearn.metrics import accuracy_score,roc_curve, auc
import matplotlib.pyplot as plt
%matplotlib inline

xgb = xgboost.XGBClassifier(nthread=10,n_estimators=300, max_depth=5)
xgb.fit(encoded_train, target_train)

y_pred_xgb=xgb.predict_proba(encoded_test)[:,1]

fpr_xgb, tpr_xgb, _ = roc_curve(target_test, y_pred_xgb)
print('AUC XGB=',auc(fpr_xgb, tpr_xgb))



('AUC XGB=', 0.99591435301741316)


Create a scoring function. Note that on the fly, the input data is encoded to dummies.

In [136]:
def predict_fn(x):
    """Return class probabilities from the XGBoost model for raw (not dummy-coded) rows.

    `x` is first passed through the fitted one-hot `encoder`, then scored with
    `xgb.predict_proba`; the result is cast to float before being returned.
    """
    dummy_coded = encoder.transform(x)
    class_probabilities = xgb.predict_proba(dummy_coded)
    return class_probabilities.astype(float)

Initialization of the explainer

In [137]:
import lime.lime_tabular  # no other cell in this notebook imports lime

# Tabular explainer built on the (not dummy-coded) training matrix. It is given
# the predictor names, the class names, and the positions and category values of
# the categorical columns, which it uses when presenting an explanation.
explainer = lime.lime_tabular.LimeTabularExplainer(
    train,
    feature_names=feature_names,
    class_names=class_names,
    categorical_features=categorical_features,
    categorical_names=categorical_names,
    kernel_width=3,
)

Pick a single test case (here row 42 of the test set), build the local explanation model around it, and show the explanation.

In [139]:
# Position in the test set of the case to explain.
i = 42
instance = test[i]

# Explain this one prediction using the five most important features, then
# render the result inline in the notebook.
exp = explainer.explain_instance(instance, predict_fn, num_features=5)
exp.show_in_notebook(show_all=False)