File created by following https://marcotcr.github.io/lime/tutorials/Tutorial%20-%20continuous%20and%20categorical%20features.html

## Import

In [None]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import lime
import lime.lime_tabular

# util
from astrapia import load_adult as la

## Load Dataset

To run the following cells, first run `setup_adult.py` in the `data/adult/` directory to generate the required dataset files.

In [None]:
# Load the 'adult' dataset through the project loader, reading from ../data.
data = la.load_csv_data('adult', root_path='../data')
# Bare expression on the last line of the cell so the notebook renders it;
# data.data is the frame later passed to preprocess() as the training split.
data.data

In [None]:
def preprocess(*data_df, dataset=None):
    """One-hot encode the categorical columns of each frame with a shared layout.

    Every returned frame has exactly the same columns in the same order:
    the non-categorical columns first (in the order they appear in
    ``dataset.data``), followed by one ``<feature>_<value>`` column per
    categorical value listed in ``dataset.categorical_features``.

    Args:
        *data_df: DataFrames to encode (e.g. train, dev and test splits).
        dataset: Object exposing ``data`` (the reference frame) and
            ``categorical_features`` (mapping of column name to its possible
            values). Defaults to the notebook-level ``data`` object.

    Returns:
        A list with one encoded DataFrame per input frame.

    Raises:
        KeyError: If a frame lacks one of the expected columns.
    """
    if dataset is None:
        dataset = data  # fall back to the dataset loaded earlier in the notebook

    categorical = dataset.categorical_features
    cat_names = list(categorical.keys())
    cat_lookup = set(cat_names)

    # Keep the source column order instead of going through a set difference:
    # set iteration order for strings varies between interpreter runs, which
    # made the feature layout (and anything indexed by position) irreproducible.
    cont_cols = [col for col in dataset.data.keys() if col not in cat_lookup]
    dummy_cols = [cat + '_' + str(attr)
                  for cat in cat_names
                  for attr in categorical[cat]]
    layout = cont_cols + dummy_cols

    def process_single(df):
        encoded = pd.get_dummies(df, columns=cat_names)
        # A split may not contain every categorical value; add the absent
        # indicator columns as all-zero in a single operation.
        absent = [col for col in dummy_cols if col not in encoded.columns]
        if absent:
            encoded = encoded.reindex(columns=list(encoded.columns) + absent,
                                      fill_value=0)
        return encoded[layout]

    return [process_single(df) for df in data_df]

In [None]:
# Encode the three splits together so they share one column layout.
train, dev, test = preprocess(
    data.data,
    data.data_dev,
    data.data_test,
)

# Matching label frames for each split.
labels_train = data.target
labels_dev = data.target_dev
labels_test = data.target_test

## Train RF Classifier

In [None]:
# Flatten the label frame to a 1-D vector before handing it to the classifier.
y_train = labels_train.to_numpy().reshape(-1)

# Random forest with 500 trees, fitted on the one-hot encoded training split.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train, y_train)

In [None]:
# Predicted labels for the dev split, compared against y_true in the report below.
y_pred = rf.predict(dev)

In [None]:
# Ground-truth dev labels flattened to 1-D, the same shape handling used for training.
y_true = labels_dev.to_numpy().reshape(-1)

In [None]:
# `import sklearn` does not load submodules, and the file only imports
# sklearn.ensemble; import sklearn.metrics explicitly rather than relying on
# it having been pulled in as a side effect of another import.
import sklearn.metrics

# Per-class precision / recall / F1 of the forest on the dev split.
dev_report = sklearn.metrics.classification_report(y_true, y_pred)
print('Classification report')
print('{:->60}'.format(''))  # 60-character dashed separator
print(dev_report)

## Explain instances

In [None]:
# Options for the tabular explainer: label features with the encoded column
# names, label classes with the dataset's target names, and leave continuous
# features undiscretized.
explainer_options = {
    'feature_names': train.keys(),
    'class_names': data.target_names,
    'discretize_continuous': False,
}
explainer = lime.lime_tabular.LimeTabularExplainer(train, **explainer_options)

### Instance 1

In [None]:
# Draw a random row position from the test split.
i = np.random.randint(0, len(test))
# Explain the forest's predicted probabilities for that row, keeping 10 features.
instance = test.iloc[i]
exp = explainer.explain_instance(instance, rf.predict_proba, num_features=10)

In [None]:
# `exp` explains test.iloc[i], so the label and raw record printed next to it
# must come from the test split as well; position i in the training split
# (data.target / data.data) is a different record.
# NOTE(review): assumes data.target_test has an 'income' column like data.target.
print('True label:', data.target_test.iloc[i]['income'])
print('{:->60}'.format(''))
print(data.data_test.iloc[i])  # un-encoded feature values of the explained row
print('{:->60}'.format(''))
# Show the explanation in the notebook output, then as a matplotlib figure.
exp.show_in_notebook(show_table=True, show_all=False, show_predicted_value=True)
print('{:->60}'.format(''))
fig = exp.as_pyplot_figure()

### Instance 2

In [None]:
# Fixed choice this time: the first row of the test split.
i = 0
instance = test.iloc[i]
# Explain the forest's predicted probabilities for that row, keeping 10 features.
exp = explainer.explain_instance(instance, rf.predict_proba, num_features=10)

In [None]:
# `exp` explains test.iloc[i], so the label and raw record printed next to it
# must come from the test split as well; position i in the training split
# (data.target / data.data) is a different record.
# NOTE(review): assumes data.target_test has an 'income' column like data.target.
print('True label:', data.target_test.iloc[i]['income'])
print('{:->60}'.format(''))
print(data.data_test.iloc[i])  # un-encoded feature values of the explained row
print('{:->60}'.format(''))
# Show the explanation in the notebook output, then as a matplotlib figure.
exp.show_in_notebook(show_table=True, show_all=False, show_predicted_value=True)
print('{:->60}'.format(''))
fig = exp.as_pyplot_figure()

### Instance 3

In [None]:
# NOTE(review): the random draw below is commented out, so this cell reuses
# whatever `i` the previously executed cell left behind (0 when the notebook is
# run top to bottom, i.e. the same row as Instance 2). Confirm whether
# Instance 3 is meant to repeat that row or to sample a new one.
#i = np.random.randint(0, test.shape[0])# get another random sample
# Explain the forest's predicted probabilities for row i, keeping 10 features.
exp = explainer.explain_instance(test.iloc[i], rf.predict_proba, num_features=10)

In [None]:
# `exp` explains test.iloc[i], so the label and raw record printed next to it
# must come from the test split as well; position i in the training split
# (data.target / data.data) is a different record.
# NOTE(review): assumes data.target_test has an 'income' column like data.target.
print('True label:', data.target_test.iloc[i]['income'])
print('{:->60}'.format(''))
print(data.data_test.iloc[i])  # un-encoded feature values of the explained row
print('{:->60}'.format(''))
# Show the explanation in the notebook output, then as a matplotlib figure.
exp.show_in_notebook(show_table=True, show_all=False, show_predicted_value=True)
print('{:->60}'.format(''))
fig = exp.as_pyplot_figure()