The aim of this notebook will be to find the following data:
* decision tree to plot in the visualization
* the neighbourhood generated by LORE

Everything will be tested on the iris dataset for quick running times

### dataset import

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import json
from sklearn.preprocessing import LabelEncoder

In [2]:
# File name reserved for the exported data (not referenced by the cells shown below).
data_file = "iris.json"

# Fetch the iris bunch once, then pull out the feature matrix, the integer labels
# and the human-readable names as plain Python lists.
data = load_iris()
X, y = data.data, data.target
feature_names = [*data.feature_names]
target_names = [*data.target_names]

### LORE initial

read data

In [3]:
from lore_sa.dataset import TabularDataset
import pandas as pd

In [4]:
# Column-oriented dict: one entry per feature, keyed by the feature name.
data_dict = {name: X[:, i] for i, name in enumerate(feature_names)}
target_name = 'target'
# Store the class as its string label (e.g. 'setosa') instead of the integer code.
data_dict[target_name] = [target_names[i] for i in y]

# Reuse `target_name` rather than repeating the literal, so the class column is
# named in exactly one place.
dataset = TabularDataset.from_dict(data_dict, target_name)
# NOTE(review): rows are dropped after the dataset has been built, so
# `dataset.descriptor` may not reflect the removal -- confirm whether it needs a refresh.
dataset.df.dropna(inplace = True)
dataset.df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


train model

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from lore_sa.bbox import sklearn_classifier_bbox

In [6]:

def tutorial_train_model_generalized(dataset: TabularDataset, target_name:str, ):
    """
    Train a random-forest black box on ``dataset`` and wrap it for LORE.

    Parameters:
    -----------
    dataset : TabularDataset
        Dataset whose ``descriptor`` lists the numeric and categorical features
        together with their column index in ``dataset.df``.
    target_name : str
        Name of the class column in ``dataset.df``.

    Returns:
    --------
    The fitted pipeline (preprocessing + RandomForestClassifier) wrapped in
    ``sklearn_classifier_bbox.sklearnBBox``.

    Notes:
    ------
    * Side effect: ``dataset.df`` is replaced by a copy without the classes that
      have fewer than 2 rows (a stratified split needs at least 2 per class).
    * Features are kept in their original column order. The ColumnTransformer
      addresses columns by position, so training on a reordered frame
      (all numeric columns first, then categorical) would apply the scaler and
      the encoder to the wrong columns whenever the two kinds are interleaved.
    """
    numeric_indices = [v['index'] for v in dataset.descriptor['numeric'].values()]
    categorical_indices = [v['index'] for v in dataset.descriptor['categorical'].values()]

    # Feature columns in their original left-to-right order, and the position each
    # one takes inside the feature-only frame X built below.
    feature_indices = sorted(numeric_indices + categorical_indices)
    position_in_X = {col: pos for pos, col in enumerate(feature_indices)}

    # Create preprocessor using positions that are valid inside X
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), [position_in_X[i] for i in numeric_indices]),
            ('cat', OrdinalEncoder(), [position_in_X[i] for i in categorical_indices])
        ]
    )

    # Remove rare classes with fewer than 2 instances
    class_counts = dataset.df[target_name].value_counts()
    valid_classes = class_counts[class_counts > 1].index
    dataset.df = dataset.df[dataset.df[target_name].isin(valid_classes)]

    # Select features (original order) and target
    X = dataset.df.iloc[:, feature_indices]
    y = dataset.df[target_name]

    # Split dataset; the held-out part is not used by this function
    X_train, _X_test, y_train, _y_test = train_test_split(X, y,
                test_size=0.3, random_state=42, stratify=y)

    model = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100, random_state=42))

    model.fit(X_train, y_train)

    return sklearn_classifier_bbox.sklearnBBox(model)

In [7]:
# Train the black-box model; `bbox` is later passed to the neighbourhood generator
# and used to label the generated neighbourhood.
bbox = tutorial_train_model_generalized(dataset, target_name)

### encoding decoding

In [8]:
from lore_sa.encoder_decoder import ColumnTransformerEnc

# Build the encoder/decoder from the dataset descriptor.
tabular_enc = ColumnTransformerEnc(dataset.descriptor)
# First row of the frame without its last entry (the target column).
ref_value = dataset.df.iloc[0].values[:-1]
# Round trip: encode the instance, then decode it back.
encoded = tabular_enc.encode([ref_value])
decoded = tabular_enc.decode(encoded)

print(f"Original value: {ref_value}")
print(f"Encoded value: {encoded}")
print(f"Decoded value: {decoded}")

Original value: [5.1 3.5 1.4 0.2]
Encoded value: [[5.1 3.5 1.4 0.2]]
Decoded value: [[5.1 3.5 1.4 0.2]]


The encoder is built from `dataset.descriptor`, which is a plain dict: it can be saved to JSON, and the encoder can later be recreated from the content read back from that file.

In [9]:
# Inspect the descriptor: per-feature statistics plus the target's distinct values and counts.
dataset.descriptor

{'numeric': {'sepal length (cm)': {'index': 0,
   'min': 4.3,
   'max': 7.9,
   'mean': 5.843333333333334,
   'std': 0.828066127977863,
   'median': 5.8,
   'q1': 5.1,
   'q3': 6.4},
  'sepal width (cm)': {'index': 1,
   'min': 2.0,
   'max': 4.4,
   'mean': 3.0573333333333337,
   'std': 0.4358662849366982,
   'median': 3.0,
   'q1': 2.8,
   'q3': 3.3},
  'petal length (cm)': {'index': 2,
   'min': 1.0,
   'max': 6.9,
   'mean': 3.7580000000000005,
   'std': 1.7652982332594662,
   'median': 4.35,
   'q1': 1.6,
   'q3': 5.1},
  'petal width (cm)': {'index': 3,
   'min': 0.1,
   'max': 2.5,
   'mean': 1.1993333333333336,
   'std': 0.7622376689603465,
   'median': 1.3,
   'q1': 0.3,
   'q3': 1.8}},
 'categorical': {},
 'ordinal': {},
 'target': {'target': {'index': 4,
   'distinct_values': ['setosa', 'versicolor', 'virginica'],
   'count': {'setosa': 50, 'versicolor': 50, 'virginica': 50}}}}

In [10]:
# Persist the descriptor as pretty-printed JSON so it can be read back later.
with open("descriptor.json", mode="w") as f:
    json.dump(dataset.descriptor, f, indent=4)

the value to use for the prediction can be found via:

In [11]:
# Feature values of the first row: everything except the last entry (the target).
ref_value = dataset.df.iloc[0].values[:-1]
# Shown as a plain list for readability.
list(ref_value)

[5.1, 3.5, 1.4, 0.2]

and decoded using

In [12]:
# Encode the reference instance, then decode it back to the original feature space.
encoded = tabular_enc.encode([ref_value])
decoded = tabular_enc.decode(encoded)
# A single instance was passed in, so show the first (only) decoded row.
list(decoded[0])

[5.1, 3.5, 1.4, 0.2]

### neighborhood generation

In [13]:
from lore_sa.neighgen import RandomGenerator

select the instance to explain

In [14]:
# Positional index of the row to explain.
num_row = 0
# Take that row and drop its last entry (the target column), keeping only the features.
x = dataset.df.iloc[num_row][:-1]
x

sepal length (cm)    5.1
sepal width (cm)     3.5
petal length (cm)    1.4
petal width (cm)     0.2
Name: 0, dtype: object

encode it

In [15]:
# Encode the instance (the class entry was already dropped when `x` was selected);
# a one-element batch is passed in, so keep only its single encoded row.
z = tabular_enc.encode([x.values])[0]

creates the neighborhood

In [16]:
# Random neighbourhood generator tied to the black box, the dataset and the encoder.
# NOTE(review): the meaning of `ocr=0.1` is not documented in this notebook -- confirm
# against the lore_sa RandomGenerator documentation.
gen = RandomGenerator(bbox=bbox, dataset=dataset, encoder=tabular_enc, ocr=0.1)
# Generate the neighbourhood around the encoded instance `z`.
# NOTE(review): 100 is presumably the number of samples requested -- confirm.
# NOTE(review): no random seed is set here, so the output may differ between runs.
neighbour = gen.generate(z, 100, dataset.descriptor, tabular_enc)
neighbour

array([[5.1, 3.5, 1.4, 0.2],
       [5.1, 3.5, 1.4, 0.2],
       [5.1, 3.5, 1.4, 0.2],
       [5.1, 2.234239791793862, 1.4, 0.2],
       [5.1, 2.234239791793862, 1.4, 0.24898021589698746],
       [5.1, 2.234239791793862, 1.4, 0.24898021589698746],
       [5.1, 2.234239791793862, 1.4, 0.24898021589698746],
       [5.1, 4.0348023592826685, 1.4, 0.24898021589698746],
       [5.1, 2.0629314350773105, 1.4, 0.24898021589698746],
       [5.1, 2.0629314350773105, 1.4, 0.24898021589698746],
       [5.1, 2.0629314350773105, 1.4, 0.24898021589698746],
       [5.1, 2.0629314350773105, 1.4, 0.24898021589698746],
       [5.1, 2.0629314350773105, 1.4, 0.24898021589698746],
       [5.1, 2.3596781869046777, 1.4, 0.24898021589698746],
       [5.1, 2.3596781869046777, 1.4, 0.24898021589698746],
       [5.1, 3.3885476371322545, 1.4, 0.24898021589698746],
       [5.1, 3.3885476371322545, 1.4, 0.24898021589698746],
       [5.1, 3.3885476371322545, 1.4, 0.24898021589698746],
       [6.100384219593169, 3.3885

### surrogate model

In [17]:
from lore_sa.surrogate import DecisionTreeSurrogate

create X, y and yz for the decision tree surrogate

In [18]:
# decode the neighborhood to be labeled by the blackbox model
neighb_train_X = tabular_enc.decode(neighbour)
neighb_train_y = bbox.predict(neighb_train_X)
# encode the target class to the surrogate model
neighb_train_yz = tabular_enc.encode_target_class(neighb_train_y.reshape(-1, 1)).squeeze()

train the surrogate model

In [19]:
dt = DecisionTreeSurrogate()
# Train on the encoded neighbourhood and its encoded black-box labels.
# The return value gets its own name: binding it to `x` would overwrite the instance
# selected for explanation, and re-running the encoding cell would then fail.
surrogate_fit = dt.train(neighbour, neighb_train_yz)

get the decision tree's underlying `tree_` structure for plotting

In [20]:
# Retrieve the low-level tree object of the trained surrogate (used below for the export).
dt.get_tree_structure()

<sklearn.tree._tree.Tree at 0x19458fb29d0>

In [21]:
#utils file in decision tree visualization prototype folder
from sklearn.tree import DecisionTreeClassifier
from dataclasses import dataclass
from typing import List, Optional
from dataclasses import asdict
import json
from sklearn.tree._tree import Tree 

@dataclass
class TreeNode:
    """
    Flat, serializable description of a single decision tree node.

    Attributes:
    -----------
    node_id : int
        Unique identifier of the node inside the tree.
    feature_name : Optional[str]
        Name of the feature tested at this node; ``None`` for a leaf.
    threshold : Optional[float]
        Split threshold applied to that feature; ``None`` for a leaf.
    left_child : Optional[int]
        Node id of the left child; ``None`` for a leaf.
    right_child : Optional[int]
        Node id of the right child; ``None`` for a leaf.
    is_leaf : bool
        ``True`` for a leaf node, ``False`` for an internal (split) node.
    class_label : Optional[str]
        Class predicted by a leaf; ``None`` for an internal node.
    samples : int
        Number of training samples that reached this node.
    """
    node_id: int
    feature_name: Optional[str]
    threshold: Optional[float]
    left_child: Optional[int]
    right_child: Optional[int]
    is_leaf: bool
    class_label: Optional[str]
    samples: int

def extract_tree_structure(tree_classifier: DecisionTreeClassifier, feature_names: List[str], target_names: List[str]) -> List[TreeNode]: 
    """
    Flatten a fitted decision tree into a list of TreeNode records.

    Parameters:
    -----------
    tree_classifier : DecisionTreeClassifier
        A trained sklearn DecisionTreeClassifier, or directly its low-level tree
        object (the form obtained from the LORE DecisionTreeSurrogate).
    feature_names : List[str]
        Feature names, indexed by the feature index stored in each split node
    target_names : List[str]
        Class labels, indexed by the position of the largest entry in a leaf's value array
        (assumes this list is ordered like the tree's classes -- TODO confirm)

    Returns:
    --------
    List[TreeNode]
        One TreeNode per node, in node id order
    """
    # A fitted classifier keeps the low-level tree in `tree_`; anything else is
    # taken to already be that tree.
    tree = tree_classifier.tree_ if isinstance(tree_classifier, DecisionTreeClassifier) else tree_classifier

    def _describe(node_id: int) -> TreeNode:
        """Build the TreeNode record for one node id."""
        sample_count = int(tree.n_node_samples[node_id])

        # A left child of -1 marks a leaf.
        if tree.children_left[node_id] == -1:
            # The leaf predicts the class with the largest entry in its value array.
            winning_index = int(tree.value[node_id].argmax())
            return TreeNode(
                node_id=node_id,
                feature_name=None,
                threshold=None,
                left_child=None,
                right_child=None,
                is_leaf=True,
                class_label=target_names[winning_index],
                samples=sample_count,
            )

        # Internal node: record the split test and both children.
        return TreeNode(
            node_id=node_id,
            feature_name=feature_names[int(tree.feature[node_id])],
            threshold=float(tree.threshold[node_id]),
            left_child=int(tree.children_left[node_id]),
            right_child=int(tree.children_right[node_id]),
            is_leaf=False,
            class_label=None,
            samples=sample_count,
        )

    return [_describe(node_id) for node_id in range(tree.node_count)]

def save_tree_to_json(nodes, filename: str, indent: int = 4):
    """
    Write a list of tree nodes to a JSON file.

    Parameters:
    -----------
    nodes : List[TreeNode]
        Dataclass instances to serialize; each becomes one JSON object
    filename : str
        Destination path (an existing file is overwritten)
    indent : int
        Number of spaces used to indent the JSON output
    """
    # Dataclass instances are not JSON-serializable as such; turn each into a dict first.
    serializable = list(map(asdict, nodes))

    with open(filename, 'w') as out:
        json.dump(serializable, out, indent=indent)

In [22]:
# The surrogate was trained on the iris features, so split nodes must be labeled with
# the iris `feature_names` loaded at the top of the notebook. A hard-coded list from
# another dataset would silently attach the wrong name to every split.
save_tree_to_json(
    extract_tree_structure(
        dt.get_tree_structure(),
        feature_names=feature_names,
        target_names=target_names,
    ),
    filename="loreTest.json",
)