In [None]:
from __future__ import print_function

import sys

import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
import sklearn.preprocessing

from anchor import anchor_tabular
from anchor import utils

# Seed NumPy's global RNG once all imports are done.
np.random.seed(1)

### Loading the dataset
This dataset is about predicting whether a person makes more or less than 50,000 dollars a year.

In [None]:
# The loader expects adult/adult.data to be present inside dataset_folder.
dataset_folder = '../data/'
dataset = utils.load_dataset(
    'adult',
    balance=True,
    dataset_folder=dataset_folder,
    discretize=True)

Let's train a classifier for this

In [None]:
# Fit a 50-tree random forest on the training split.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, n_jobs=5)
rf.fit(dataset.train, dataset.labels_train)

# Report accuracy on each split, training split first.
for split_name, split_rows, split_labels in (
        ('Train', dataset.train, dataset.labels_train),
        ('Test', dataset.test, dataset.labels_test)):
    print(split_name, sklearn.metrics.accuracy_score(split_labels, rf.predict(split_rows)))

### Getting an anchor

Now let's start the explainer. We need the training data to perturb instances.
`categorical_names` is a map from integer (feature index) to a list of strings, containing the names for each value of that categorical feature.
Every feature that is not in this map will be considered ordinal or continuous, and thus discretized.


In [None]:
# Build the tabular explainer from the dataset's class names, feature names,
# training rows and categorical value names.
explainer = anchor_tabular.AnchorTabularExplainer(
    dataset.class_names, dataset.feature_names,
    dataset.train, dataset.categorical_names)

Below, we get an anchor for prediction number 0. An anchor is a sufficient condition - that is, when the anchor holds, the prediction should be the same as the prediction for this instance.

In [None]:
# Explain test instance number 0; NumPy's global RNG is re-seeded just before.
idx = 0
np.random.seed(1)
instance = dataset.test[idx]
# predict() takes a 2-D batch, so the single row is reshaped to (1, n_features).
predicted_class = rf.predict(instance.reshape(1, -1))[0]
print('Prediction: ', explainer.class_names[predicted_class])
exp = explainer.explain_instance(instance, rf.predict, threshold=0.95)

In [None]:
# Show the anchor rule, then the precision and coverage reported by the explanation.
anchor_rule = ' AND '.join(exp.names())
print('Anchor: %s' % anchor_rule)
anchor_precision = exp.precision()
print('Precision: %.2f' % anchor_precision)
anchor_coverage = exp.coverage()
print('Coverage: %.2f' % anchor_coverage)

Note that we set threshold to 0.95, so we guarantee (with high probability) that precision will be above 0.95 - that is, that predictions on instances where the anchor holds will be the same as the original prediction at least 95% of the time. Let's try it out on the test set:

In [None]:
# Get test examples where the anchor applies: rows whose values on the
# anchor's features all equal those of the explained instance.
anchor_features = exp.features()
matches_anchor = np.all(
    dataset.test[:, anchor_features] == dataset.test[idx][anchor_features], axis=1)
fit_anchor = np.where(matches_anchor)[0]

# Precision: share of matching rows that get the same prediction as the explained instance.
anchor_predictions = rf.predict(dataset.test[fit_anchor])
original_prediction = rf.predict(dataset.test[idx].reshape(1, -1))
print('Anchor test precision: %.2f' % (np.mean(anchor_predictions == original_prediction)))

# Coverage: share of all test rows that match the anchor.
n_test_rows = float(dataset.test.shape[0])
print('Anchor test coverage: %.2f' % (fit_anchor.shape[0] / n_test_rows))

### Looking at a partial anchor
You can look at just part of the anchor - for example, the first two clauses. Note how these do not have enough precision, which is why the explainer added a third one.

In [None]:
# Summarize the partial anchor at index 1 (the first two clauses, per the text above).
partial_rule = ' AND '.join(exp.names(1))
print('Partial anchor: %s' % partial_rule)
partial_precision = exp.precision(1)
print('Partial precision: %.2f' % partial_precision)
partial_coverage = exp.coverage(1)
print('Partial coverage: %.2f' % partial_coverage)


In [None]:
# Test rows whose values on the partial anchor's features all equal those of
# the explained instance.
partial_features = exp.features(1)
matches_partial = np.all(
    dataset.test[:, partial_features] == dataset.test[idx][partial_features], axis=1)
fit_partial = np.where(matches_partial)[0]

# Precision: share of matching rows that get the same prediction as the explained instance.
partial_predictions = rf.predict(dataset.test[fit_partial])
original_prediction = rf.predict(dataset.test[idx].reshape(1, -1))
print('Partial anchor test precision: %.2f' % (np.mean(partial_predictions == original_prediction)))

# Coverage: share of all test rows that match the partial anchor.
n_test_rows = float(dataset.test.shape[0])
print('Partial anchor test coverage: %.2f' % (fit_partial.shape[0] / n_test_rows))


### See a visualization of the anchor with examples (won't work if you're viewing this on GitHub)

In [None]:
# Render the explanation's visualization inline in the notebook.
exp.show_in_notebook()

## What if I'm using an encoder, e.g. OneHot?

There are two options when using an encoder, as detailed below:

### Option 1: fold the encoding into the prediction function

In [None]:
# Fit a one-hot encoder on the training rows so raw rows can be encoded
# before being handed to a classifier.
encoder = sklearn.preprocessing.OneHotEncoder()
# Kept as the cell's last expression, so the notebook displays the call's return value.
encoder.fit(dataset.train)

In [None]:
# Re-train the forest, this time on one-hot encoded training rows.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, n_jobs=5)
rf.fit(encoder.transform(dataset.train), dataset.labels_train)


def predict_fn(x):
    """Encode raw rows with the fitted encoder, then classify them with the forest."""
    return rf.predict(encoder.transform(x))


# Report accuracy on each split, training split first.
for split_name, split_rows, split_labels in (
        ('Train', dataset.train, dataset.labels_train),
        ('Test', dataset.test, dataset.labels_test)):
    print(split_name, sklearn.metrics.accuracy_score(split_labels, predict_fn(split_rows)))


Note that our predict function here takes the original data and encodes it before passing it to the Random Forest.
In this case, getting an explanation is the same as before, except that we use `predict_fn` rather than `rf.predict`:

In [None]:
# Explain test instance number 0 again, this time through predict_fn, which
# encodes the raw rows itself. NumPy's global RNG is re-seeded just before.
idx = 0
np.random.seed(1)
instance = dataset.test[idx]
# predict_fn takes a 2-D batch, so the single row is reshaped to (1, n_features).
predicted_class = predict_fn(instance.reshape(1, -1))[0]
print('Prediction: ', explainer.class_names[predicted_class])
exp = explainer.explain_instance(instance, predict_fn, threshold=0.95)

In [None]:
# Show the anchor rule, then the precision and coverage reported by the explanation.
anchor_rule = ' AND '.join(exp.names())
print('Anchor: %s' % anchor_rule)
anchor_precision = exp.precision()
print('Precision: %.2f' % anchor_precision)
anchor_coverage = exp.coverage()
print('Coverage: %.2f' % anchor_coverage)

In [None]:
# Render the explanation's visualization inline in the notebook.
exp.show_in_notebook()

### Option 2: use the `encoder_fn` param 

You can initialize the anchor explainer with the optional `encoder_fn` param, which will be called before every call to the prediction function

In [None]:
# Rebuild the explainer, passing the fitted encoder's transform as encoder_fn.
explainer = anchor_tabular.AnchorTabularExplainer(
    dataset.class_names,
    dataset.feature_names,
    dataset.train,
    dataset.categorical_names,
    encoder_fn=encoder.transform)

Now we can get an explanation using `rf.predict` again, since the explainer applies `encoder_fn` before each call to the prediction function:

In [None]:
# Explain test instance number 0 with the encoder-aware explainer.
# NumPy's global RNG is re-seeded just before.
idx = 0
np.random.seed(1)
instance = dataset.test[idx]
# The printed prediction still goes through predict_fn, which encodes the raw row itself.
predicted_class = predict_fn(instance.reshape(1, -1))[0]
print('Prediction: ', explainer.class_names[predicted_class])
# The explainer is given the bare rf.predict here.
exp = explainer.explain_instance(instance, rf.predict, threshold=0.95)

In [None]:
# Show the anchor rule, then the precision and coverage reported by the explanation.
anchor_rule = ' AND '.join(exp.names())
print('Anchor: %s' % anchor_rule)
anchor_precision = exp.precision()
print('Precision: %.2f' % anchor_precision)
anchor_coverage = exp.coverage()
print('Coverage: %.2f' % anchor_coverage)

In [None]:
# Render the explanation's visualization inline in the notebook.
exp.show_in_notebook()