In [11]:
from sklearn.tree import DecisionTreeClassifier
from dataclasses import dataclass
from typing import List, Optional, Tuple
from dataclasses import asdict
import json

In [12]:
from utils import extract_tree_structure, save_tree_to_json, TreeNode

In [13]:
from lore_sa.dataset import TabularDataset
import pandas as pd

In [14]:
# Prepare a cleaned copy of the dataset for LORE

# Read the raw data
df = pd.read_csv("test_dataset.csv")

# Columns used as model features
df_features = ['engine_age', 'length', 'power', 'month', 'weight', 'y_month',
       'year', 'surf_temp']
# Names given to the 10 target bins, in bin order
labels = [
    "Poor Session", "Below Average", "Average Session", "Above Average", 
    "Good Session", "Great Session", "Excellent Session", 
    "Outstanding", "Legendary", "Epic"
]

# Feature matrix
X = df[df_features]
# Target: the continuous "value" column discretised into 10 labelled bins
y = pd.cut(df["value"], bins=10, labels=labels)

# Features plus the binned target in one new frame
final_df = X.assign(value=y)

# Persist without the index column so it can be re-read by LORE
final_df.to_csv("clean dataset for LORE.csv", index=False)

In [15]:
# Load the cleaned CSV as a LORE TabularDataset, using "value" as the class column.
dataset = TabularDataset.from_csv('clean dataset for LORE.csv', class_name = "value")
# Drop rows containing missing values in place (this mutates dataset.df).
dataset.df.dropna(inplace = True)
# Display the resulting frame.
dataset.df

2025-02-10 15:16:40,760 root         INFO     clean dataset for LORE.csv file imported


Unnamed: 0,engine_age,length,power,month,weight,y_month,year,surf_temp,value
0,10.0,10.50,367.0,44,5.0,8,3.0,286.90,Poor Session
1,26.0,21.30,970.0,131,19.0,11,10.0,276.25,Poor Session
2,32.0,12.13,190.0,170,0.0,2,14.0,279.03,Poor Session
3,8.0,27.45,1014.0,89,0.0,5,7.0,282.22,Poor Session
4,31.0,9.22,80.0,216,12.5,0,18.0,284.25,Poor Session
...,...,...,...,...,...,...,...,...,...
1601865,15.0,14.99,300.0,61,0.0,1,5.0,279.86,Poor Session
1601866,34.0,19.01,370.0,39,0.0,3,3.0,279.82,Poor Session
1601867,3.0,7.00,144.0,124,1.5,4,10.0,287.35,Poor Session
1601868,9.0,14.16,291.0,121,0.0,1,10.0,279.31,Poor Session


In [16]:
# Refresh dataset.descriptor; presumably needed because rows were dropped from
# dataset.df after loading — TODO confirm against the lore_sa documentation.
dataset.update_descriptor()

In [17]:
# Display the descriptor (per-feature column index and summary statistics).
dataset.descriptor

{'numeric': {'engine_age': {'index': 0,
   'min': 0.0,
   'max': 120.0,
   'mean': 24.548608028926612,
   'std': 13.54915849085067,
   'median': 25.0,
   'q1': 15.0,
   'q3': 33.0},
  'length': {'index': 1,
   'min': 3.7,
   'max': 94.32,
   'mean': 15.44404625109093,
   'std': 13.587122314981222,
   'median': 10.55,
   'q1': 9.04,
   'q3': 14.65},
  'power': {'index': 2,
   'min': 0.0,
   'max': 11000.0,
   'mean': 516.7220143302916,
   'std': 1162.8706193281064,
   'median': 152.0,
   'q1': 80.0,
   'q3': 340.0},
  'month': {'index': 3,
   'min': 37,
   'max': 221,
   'mean': 121.89682542914188,
   'std': 53.928536447105905,
   'median': 118.0,
   'q1': 74.0,
   'q3': 168.0},
  'weight': {'index': 4,
   'min': 0.0,
   'max': 2490343.0,
   'mean': 6822.974977855779,
   'std': 71718.11132857874,
   'median': 0.05,
   'q1': 0.0,
   'q3': 67.0},
  'y_month': {'index': 5,
   'min': 0,
   'max': 11,
   'mean': 5.449684497415037,
   'std': 3.352452423506836,
   'median': 5.0,
   'q1': 3.0,


In [18]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from lore_sa.bbox import sklearn_classifier_bbox

In [19]:
# improved train_model method from the tutorial 
def train_model(dataset: TabularDataset):
    """Train a random-forest black box on ``dataset`` and wrap it for LORE.

    The feature columns are those listed under the ``'numeric'`` and
    ``'categorical'`` entries of ``dataset.descriptor``. Numeric columns are
    standardised and categorical ones ordinal-encoded before being passed to a
    ``RandomForestClassifier``. Only the 70% training part of a stratified
    split is used for fitting.

    Side effect: ``dataset.df`` is replaced by the frame filtered to classes
    that occur more than once (stratified splitting fails on singleton
    classes). ``dataset.descriptor`` is not refreshed by this function.

    Args:
        dataset: dataset exposing ``df`` (a DataFrame) and ``descriptor``
            (a dict with per-feature ``'index'`` entries).

    Returns:
        The fitted pipeline wrapped in ``sklearn_classifier_bbox.sklearnBBox``.
    """
    descriptor = dataset.descriptor
    # A missing feature family is treated as "no columns of that kind".
    numeric_indices = [info['index'] for info in descriptor.get('numeric', {}).values()]
    categorical_indices = [info['index'] for info in descriptor.get('categorical', {}).values()]

    # Target column: the dataset's own class name when it exposes one,
    # otherwise the 'value' column this notebook writes.
    target = getattr(dataset, 'class_name', None) or 'value'

    # Remove rare classes with fewer than 2 instances
    class_counts = dataset.df[target].value_counts()
    valid_classes = class_counts[class_counts > 1].index
    dataset.df = dataset.df[dataset.df[target].isin(valid_classes)]

    # Select the features in ascending column order, so X keeps the layout the
    # descriptor describes instead of "all numeric, then all categorical".
    feature_indices = sorted(numeric_indices + categorical_indices)
    X = dataset.df.iloc[:, feature_indices]
    y = dataset.df[target]

    # The transformer selects columns by position *inside X*, so translate the
    # descriptor indices (positions in dataset.df) into positions in X.
    position_of = {column: position for position, column in enumerate(feature_indices)}
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), [position_of[i] for i in numeric_indices]),
            ('cat', OrdinalEncoder(), [position_of[i] for i in categorical_indices])
        ]
    )

    # Split dataset; the held-out part is not used here.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y)

    model = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100, random_state=42))
    model.fit(X_train, y_train)

    return sklearn_classifier_bbox.sklearnBBox(model)

In [None]:
# Train the black-box model; note that train_model also reassigns dataset.df
# to the frame with rare classes filtered out.
bbox = train_model(dataset)

In [29]:
# NOTE(review): `model` is a method, so this expression displays the bound
# method itself rather than calling it; use `bbox.model()` if its return
# value is what should be inspected — confirm what it returns first.
bbox.model

<bound method AbstractBBox.model of <lore_sa.bbox.sklearn_classifier_bbox.sklearnBBox object at 0x0000026245463EE0>>

In [21]:
from lore_sa.lore import TabularRandomGeneratorLore

# Build the LORE explainer from the trained black box and the dataset.
tabularLore = TabularRandomGeneratorLore(bbox, dataset)

In [22]:
# Positional index of the instance to explain
num_row = 10
# Take that row and slice off its final entry: we exclude the target feature
instance_row = dataset.df.iloc[num_row]
x = instance_row[:-1]
# Ask LORE for an explanation of the instance, then show it
explanation = tabularLore.explain(x)
print(explanation)

{'rule': {'premises': [{'attr': 'weight', 'val': 1355313.3125, 'op': '>'}, {'attr': 'power', 'val': 4368.33447265625, 'op': '<='}, {'attr': 'power', 'val': 3376.3662109375, 'op': '>'}, {'attr': 'month', 'val': 148.65327072143555, 'op': '>'}, {'attr': 'year', 'val': 4.798688173294067, 'op': '>'}, {'attr': 'surf_temp', 'val': 281.4221649169922, 'op': '>'}, {'attr': 'length', 'val': 58.022499084472656, 'op': '<='}], 'consequence': {'attr': 'value', 'val': 'Poor Session', 'op': '='}}, 'counterfactuals': [{'premises': [{'attr': 'weight', 'val': 1355313.3125, 'op': '>'}, {'attr': 'power', 'val': 4368.33447265625, 'op': '>'}, {'attr': 'month', 'val': 97.8215103149414, 'op': '>'}, {'attr': 'year', 'val': 17.106593132019043, 'op': '<='}, {'attr': 'year', 'val': 4.798688173294067, 'op': '>'}, {'attr': 'length', 'val': 10.610825061798096, 'op': '>'}, {'attr': 'surf_temp', 'val': 278.7205352783203, 'op': '>'}], 'consequence': {'attr': 'value', 'val': 'Below Average', 'op': '='}}, {'premises': [{'a

In [23]:
from lore_sa.encoder_decoder import ColumnTransformerEnc

# Build the encoder from the dataset descriptor
tabular_enc = ColumnTransformerEnc(dataset.descriptor)

# Reference instance: row `num_row` as an array, without its last entry
ref_row = dataset.df.iloc[num_row]
ref_value = ref_row.values[:-1]

# Round-trip the instance through the encoder
encoded = tabular_enc.encode([ref_value])
decoded = tabular_enc.decode(encoded)

# Report the original, encoded and decoded forms
for stage, shown in (("Original", ref_value), ("Encoded", encoded), ("Decoded", decoded)):
    print(f"{stage} value: {shown}")

Original value: [35.0 10.62 131.0 143 0.0 11 11.0 278.45]
Encoded value: [[35.0 10.62 131.0 143 0.0 11 11.0 278.45]]
Decoded value: [[35.0 10.62 131.0 143 0.0 11 11.0 278.45]]


In [24]:
from lore_sa.neighgen import RandomGenerator

# Instance to explain: row `num_row` with the class feature (last entry) removed
num_row = 10
x = dataset.df.iloc[num_row][:-1]

# encode() is given a one-row list, so unwrap the single encoded row
encoded_rows = tabular_enc.encode([x.values])
z = encoded_rows[0]

# Random neighbourhood generator around the encoded instance.
# NOTE(review): 100 is presumably the number of instances to generate — confirm.
gen = RandomGenerator(bbox=bbox, dataset=dataset, encoder=tabular_enc, ocr=0.1)
neighbour = gen.generate(z, 100, dataset.descriptor, tabular_enc)

print('Neighborhood', neighbour)

Neighborhood [[35.0 10.62 4149.653115699672 143 0.0 11 11.0 278.45]
 [35.0 10.62 4149.653115699672 143 0.0 11 11.0 278.45]
 [35.0 10.62 4149.653115699672 186.16580097459854 0.0 11 11.0 278.45]
 [35.0 10.62 4149.653115699672 186.16580097459854 0.0 11
  6.565535857282923 278.45]
 [35.0 10.62 4149.653115699672 186.16580097459854 0.0 11
  6.565535857282923 278.45]
 [35.0 10.62 4149.653115699672 186.16580097459854 0.0 11
  12.216227961026815 278.45]
 [35.0 10.62 4021.6511243031173 186.16580097459854 0.0 11
  12.216227961026815 278.45]
 [6.773353704770186 16.058733270543023 4739.2213143598365
  186.16580097459854 0.0 11 12.216227961026815 278.45]
 [6.773353704770186 16.058733270543023 4739.2213143598365
  186.16580097459854 0.0 11 12.216227961026815 278.45]
 [6.773353704770186 16.058733270543023 5877.646147251473
  186.16580097459854 0.0 11 12.216227961026815 278.45]
 [6.773353704770186 16.058733270543023 5877.646147251473
  186.16580097459854 0.0 11 4.304935317436909 278.45]
 [6.77335370477

In [25]:
from lore_sa.surrogate import DecisionTreeSurrogate
# Decode the neighbourhood so it can be labelled by the black-box model
neighb_train_X = tabular_enc.decode(neighbour)
neighb_train_y = bbox.predict(neighb_train_X)
# Encode the predicted classes for the surrogate model (column vector in,
# squeezed back to a flat array out)
neighb_train_yz = tabular_enc.encode_target_class(neighb_train_y.reshape(-1, 1)).squeeze()

# Fit the surrogate decision tree on the encoded neighbourhood.
dt = DecisionTreeSurrogate()
# The result is bound to its own name instead of `x`, so the instance being
# explained is no longer overwritten by a different kind of object; binding
# it also keeps the cell from echoing the return value.
train_result = dt.train(neighbour, neighb_train_yz)

In [26]:
# Re-select the instance to explain and encode it
num_row = 10
instance_row = dataset.df.iloc[num_row]
x = instance_row[:-1]  # remove the class feature from the input instance
z = tabular_enc.encode([x.values])[0]

# Rule the surrogate tree gives for this instance
rule = dt.get_rule(z, tabular_enc)
print('rule', rule)

# Counterfactual rules (and their deltas) from the same neighbourhood
crules, deltas = dt.get_counterfactual_rules(z, neighbour, neighb_train_yz, tabular_enc)
print('\n crules')
for counter_rule in crules:
    print(counter_rule)


rule premises:
power <= 7473.489990234375
surf_temp <= 292.7982482910156
surf_temp > 274.5857696533203
month <= 205.19869995117188 
consequence: value = Poor Session

 crules
premises:
power <= 7473.489990234375
surf_temp <= 292.7982482910156
month > 205.19869995117188 
consequence: value = Below Average


In [27]:
# Raw structure of the fitted surrogate tree
surrogate_structure = dt.get_tree_structure()
# Convert it with the utils helper, supplying feature and class names.
# NOTE(review): target_names is the full `labels` list — confirm its order
# matches the class ids used by the surrogate's target encoding.
tree_root = extract_tree_structure(
    surrogate_structure, feature_names=df_features, target_names=labels
)
# Write the converted tree to disk
save_tree_to_json(tree_root, filename="loreTree.json")