In [1]:
from lore_sa.dataset import TabularDataset
import pandas as pd

# Load `adult.csv`; `income` is passed as the class (target) column name.
dataset = TabularDataset.from_csv('adult.csv', class_name = "income")
# Drop rows containing missing values, modifying the wrapped DataFrame in place.
dataset.df.dropna(inplace = True)
# Display the cleaned DataFrame as the cell output.
dataset.df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [2]:
# List the top-level groups of the dataset descriptor.
dataset.descriptor.keys()

dict_keys(['numeric', 'categorical', 'ordinal', 'target'])

In [3]:
# List the column names of the underlying DataFrame.
dataset.df.keys()

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [4]:
# Remove the `fnlwgt` and `educational-num` columns from the DataFrame in place.
dataset.df.drop(['fnlwgt', 'educational-num'], inplace=True, axis=1)
# NOTE(review): presumably re-derives the descriptor from the remaining columns — confirm.
dataset.update_descriptor()

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from lore_sa.bbox import sklearn_classifier_bbox


def train_model(dataset: TabularDataset):
    """Train a random-forest pipeline on ``dataset`` and wrap it as a LORE black box.

    Numeric columns are standardized and categorical columns are ordinal-encoded;
    the column positions are taken from the ``index`` entries of
    ``dataset.descriptor``. The pipeline is fitted on a stratified 70% split of
    the data (``random_state=42``); the remaining 30% is not used.

    Args:
        dataset: dataset whose ``df`` holds the feature columns ``'age'`` through
            ``'native-country'`` and the target column ``'income'``.

    Returns:
        The fitted pipeline wrapped in ``sklearn_classifier_bbox.sklearnBBox``.
    """
    numeric_indices = [v['index'] for v in dataset.descriptor['numeric'].values()]
    categorical_indices = [v['index'] for v in dataset.descriptor['categorical'].values()]
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_indices),
            # Categories that never appear in the training split (rare values,
            # synthetic neighbours) are encoded as -1 instead of raising an
            # error at predict time. Encodings of seen categories are unchanged.
            ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
             categorical_indices),
        ]
    )
    model = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100, random_state=42))

    features = dataset.df.loc[:, 'age':'native-country'].values
    target = dataset.df['income'].values
    # Only the training part of the split is needed; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(
        features, target, test_size=0.3, random_state=42, stratify=target
    )
    model.fit(X_train, y_train)

    return sklearn_classifier_bbox.sklearnBBox(model)

In [6]:
# Train the pipeline and keep the wrapped black-box model for the explainers below.
bbox = train_model(dataset)

In [7]:
from lore_sa.lore import TabularGeneticGeneratorLore

# Build the LORE explainer from the black box and the dataset.
tabularLore = TabularGeneticGeneratorLore(bbox, dataset)

In [9]:
import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to built-in Python equivalents.

    Pass it as ``cls=NpEncoder`` to ``json.dumps`` / ``json.dump`` so that NumPy
    booleans, integers, floats and arrays are serialized instead of raising
    ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: a value the standard encoder could not serialize.

        Returns:
            ``bool``, ``int``, ``float`` or (nested) ``list`` for NumPy booleans,
            integers, floats and arrays respectively.

        Raises:
            TypeError: raised by the base class for any other unsupported type.
        """
        # np.bool_ is neither an np.integer nor a Python bool, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [12]:
# Inspect the surrogate model held by the explainer.
tabularLore.surrogate

<lore_sa.surrogate.decision_tree.DecisionTreeSurrogate at 0x214c2b70670>

In [10]:
# Position of the row to explain.
num_row = 10
x = dataset.df.iloc[num_row][:-1] # drop the last column (the target, `income`)
# Ask the LORE explainer for an explanation of this instance.
explanation = tabularLore.explain(x)
# Pretty-print the explanation; NpEncoder converts the NumPy values json cannot serialize.
# NOTE(review): `json` is already imported in the cell that defines NpEncoder, so this import is redundant.
import json
print(json.dumps(explanation, indent=4, cls=NpEncoder))

{
    "rule": {
        "premises": [
            {
                "attr": "age",
                "val": 35.0,
                "op": "<="
            },
            {
                "attr": "capital-gain",
                "val": 12501.0,
                "op": "<="
            },
            {
                "attr": "capital-loss",
                "val": 2374.0,
                "op": "<="
            }
        ],
        "consequence": {
            "attr": "income",
            "val": "<=50K",
            "op": "="
        }
    },
    "counterfactuals": [
        {
            "premises": [
                {
                    "attr": "age",
                    "val": 35.0,
                    "op": "<="
                },
                {
                    "attr": "capital-gain",
                    "val": 12501.0,
                    "op": ">"
                }
            ],
            "consequence": {
                "attr": "income",
                "val": ">50K",
       

In [None]:
from lore_sa.encoder_decoder import ColumnTransformerEnc

# Build the encoder/decoder from the dataset descriptor.
tabular_enc = ColumnTransformerEnc(dataset.descriptor)
# First row of the data without its last column (the target, `income`).
ref_value = dataset.df.iloc[0].values[:-1]
# Encode the row, then decode it again so the round trip can be compared below.
encoded = tabular_enc.encode([ref_value])
decoded = tabular_enc.decode(encoded)

print(f"Original value: {ref_value}")
print(f"Encoded value: {encoded}")
print(f"Decoded value: {decoded}")

#### Neighborhood generation
Now that we can encode and decode the dataset, we can generate the neighborhood of the instance to be explained. To customize the neighborhood generation process, we now create a generator explicitly, using a generator class from `lore_sa.neighgen` such as `GeneticGenerator`. The neighborhood is a synthetic dataset created by generating random instances around the instance to be explained; with a genetic generator, these instances are further refined by a genetic algorithm to obtain a denser and more compact neighborhood. Note that the cell below uses the simpler `RandomGenerator`.

In [None]:
from lore_sa.neighgen import RandomGenerator

num_row = 10
x = dataset.df.iloc[num_row][:-1] # remove the income feature from the input instance
z = tabular_enc.encode([x.values])[0] # encoded form of the instance

# NOTE(review): the meaning of `ocr=0.1` is not visible here — confirm against the RandomGenerator documentation.
gen = RandomGenerator(bbox=bbox, dataset=dataset, encoder=tabular_enc, ocr=0.1)
# NOTE(review): 100 is presumably the number of synthetic instances to generate — confirm.
neighbour = gen.generate(z, 100, dataset.descriptor, tabular_enc)

print('Neighborhood', neighbour)

## Surrogate model

The `surrogate` submodule creates a classifier and provides the methods to extract its corresponding classification rules. Once the surrogate has been trained, we extract a set of rules that explains why the surrogate model classifies the instance in a certain way.

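In the following example, the instance `x` has been classified as an adult with an income of at most 50k (`<=50K`). The surrogate model has used the features `capital-gain`, `capital-loss`, `marital-status`, and `native-country`. Because the neighborhood is generated randomly, the features that appear in the extracted rule may differ from run to run.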

In [None]:
from lore_sa.surrogate import DecisionTreeSurrogate
# Decode the neighborhood so that it can be labeled by the black-box model.
neighb_train_X = tabular_enc.decode(neighbour)
neighb_train_y = bbox.predict(neighb_train_X)
# Encode the predicted target classes (as a column vector) for the surrogate model,
# then squeeze the result back to one dimension.
neighb_train_yz = tabular_enc.encode_target_class(neighb_train_y.reshape(-1, 1)).squeeze()

# Train the decision-tree surrogate on the encoded neighborhood and encoded labels.
dt = DecisionTreeSurrogate()
dt.train(neighbour, neighb_train_yz)

In [None]:
# Re-create the encoded instance (same row as in the neighborhood cell).
num_row = 10
x = dataset.df.iloc[num_row][:-1] # remove the income feature from the input instance
z = tabular_enc.encode([x.values])[0]
# Rule the surrogate applies to the instance.
rule = dt.get_rule(z, tabular_enc)
print('rule', rule)
# Counterfactual rules, computed from the instance, the neighborhood and its encoded labels.
# NOTE(review): `deltas` is not used in this cell — confirm what it contains.
crules, deltas = dt.get_counterfactual_rules(z, neighbour, neighb_train_yz, tabular_enc)
print('\n crules')
for c in crules:
    print(c)
