In [29]:
from sklearn.tree import DecisionTreeClassifier
from dataclasses import dataclass
from typing import List, Optional, Tuple
from dataclasses import asdict
import json

In [30]:
from utils import extract_tree_structure, save_tree_to_json, TreeNode

In [31]:
from lore_sa.dataset import TabularDataset
import pandas as pd

In [32]:
# Build the CSV consumed by LORE: the selected feature columns plus a binned "value" target.
import pandas as pd

df = pd.read_csv("test_dataset.csv")

# Feature matrix; the column order here is the column order written to the CSV.
X = df[[
    "engine_age", "length", "power", "month",
    "weight", "y_month", "year", "surf_temp",
]]

# One label per bin, assigned by pd.cut in this order to its intervals.
labels = [
    "Poor Session",
    "Below Average",
    "Average Session",
    "Above Average",
    "Good Session",
    "Great Session",
    "Excellent Session",
    "Outstanding",
    "Legendary",
    "Epic",
]

# Discretise the numeric target into as many bins as there are labels (10).
y = pd.cut(df["value"], bins=len(labels), labels=labels)

# Features followed by the categorical target as the last column.
final_df = X.assign(value=y)

final_df.to_csv("clean dataset for LORE.csv", index=False)

In [33]:
# Load the binned CSV as a LORE TabularDataset, with "value" as the class column.
dataset = TabularDataset.from_csv('clean dataset for LORE.csv', class_name = "value")
# Drop rows containing missing values, modifying the dataset's frame in place.
dataset.df.dropna(inplace = True)
# Display the resulting frame.
dataset.df

2025-02-10 12:34:30,501 root         INFO     clean dataset for LORE.csv file imported


Unnamed: 0,engine_age,length,power,month,weight,y_month,year,surf_temp,value
0,10.0,10.50,367.0,44,5.0,8,3.0,286.90,Poor Session
1,26.0,21.30,970.0,131,19.0,11,10.0,276.25,Poor Session
2,32.0,12.13,190.0,170,0.0,2,14.0,279.03,Poor Session
3,8.0,27.45,1014.0,89,0.0,5,7.0,282.22,Poor Session
4,31.0,9.22,80.0,216,12.5,0,18.0,284.25,Poor Session
...,...,...,...,...,...,...,...,...,...
1601865,15.0,14.99,300.0,61,0.0,1,5.0,279.86,Poor Session
1601866,34.0,19.01,370.0,39,0.0,3,3.0,279.82,Poor Session
1601867,3.0,7.00,144.0,124,1.5,4,10.0,287.35,Poor Session
1601868,9.0,14.16,291.0,121,0.0,1,10.0,279.31,Poor Session


In [34]:
# List the top-level sections of the dataset descriptor.
dataset.descriptor.keys()

dict_keys(['numeric', 'categorical', 'ordinal', 'target'])

In [35]:
# Recompute the descriptor; presumably needed so its statistics reflect the
# frame after the in-place dropna performed earlier — TODO confirm against lore_sa.
dataset.update_descriptor()

In [36]:
# Display the descriptor (per-feature entries such as index, min, max, mean, std, median, q1, q3).
dataset.descriptor

{'numeric': {'engine_age': {'index': 0,
   'min': 0.0,
   'max': 120.0,
   'mean': 24.548608028926612,
   'std': 13.54915849085067,
   'median': 25.0,
   'q1': 15.0,
   'q3': 33.0},
  'length': {'index': 1,
   'min': 3.7,
   'max': 94.32,
   'mean': 15.44404625109093,
   'std': 13.587122314981222,
   'median': 10.55,
   'q1': 9.04,
   'q3': 14.65},
  'power': {'index': 2,
   'min': 0.0,
   'max': 11000.0,
   'mean': 516.7220143302916,
   'std': 1162.8706193281064,
   'median': 152.0,
   'q1': 80.0,
   'q3': 340.0},
  'month': {'index': 3,
   'min': 37,
   'max': 221,
   'mean': 121.89682542914188,
   'std': 53.928536447105905,
   'median': 118.0,
   'q1': 74.0,
   'q3': 168.0},
  'weight': {'index': 4,
   'min': 0.0,
   'max': 2490343.0,
   'mean': 6822.974977855779,
   'std': 71718.11132857874,
   'median': 0.05,
   'q1': 0.0,
   'q3': 67.0},
  'y_month': {'index': 5,
   'min': 0,
   'max': 11,
   'mean': 5.449684497415037,
   'std': 3.352452423506836,
   'median': 5.0,
   'q1': 3.0,


In [37]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from lore_sa.bbox import sklearn_classifier_bbox

In [50]:
def train_model(dataset: TabularDataset, num_numeric_features:int, targetClass:str):
    """Train a random-forest black box on ``dataset`` and wrap it for LORE.

    Parameters
    ----------
    dataset : TabularDataset
        Dataset whose ``descriptor`` lists the numeric and categorical feature
        indices and whose ``df`` holds the data.
    num_numeric_features : int
        Not used by the implementation; kept so existing callers keep working.
    targetClass : str
        Name of the target column in ``dataset.df``.

    Returns
    -------
    The fitted preprocessing + RandomForest pipeline wrapped in
    ``sklearn_classifier_bbox.sklearnBBox``.

    Side effects
    ------------
    ``dataset.df`` is replaced by a copy without the rows whose class occurs
    only once. This is kept from the original implementation because later
    cells read ``dataset.df``.
    """
    numeric_indices = [spec['index'] for spec in dataset.descriptor['numeric'].values()]
    categorical_indices = [spec['index'] for spec in dataset.descriptor['categorical'].values()]

    # The transformer addresses columns by integer position.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_indices),
            ('cat', OrdinalEncoder(), categorical_indices)
        ]
    )

    # Drop classes with a single instance; the stratified split below cannot
    # handle them. Count once and reuse the result.
    class_counts = dataset.df[targetClass].value_counts()
    valid_classes = class_counts[class_counts > 1].index
    dataset.df = dataset.df[dataset.df[targetClass].isin(valid_classes)]

    # Select the features in ascending descriptor-index order so that the
    # positional indices given to the ColumnTransformer point at the same
    # columns in X as in dataset.df. Concatenating numeric + categorical
    # would reorder the columns whenever the two groups interleave.
    # Assumes the feature columns are the leading columns of dataset.df
    # (target last) — TODO confirm for datasets with a different layout.
    feature_indices = sorted(numeric_indices + categorical_indices)
    X = dataset.df.iloc[:, feature_indices]
    y = dataset.df[targetClass]

    # Same split as before (70% train, fixed seed, stratified); the held-out
    # part is not used here.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    model = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100, random_state=42))
    model.fit(X_train, y_train)

    return sklearn_classifier_bbox.sklearnBBox(model)

In [51]:
# Fit the black-box classifier on the dataset; "value" is the target column.
bbox = train_model(
    dataset,
    num_numeric_features=len(dataset.descriptor["numeric"]),
    targetClass="value",
)

In [52]:
from lore_sa.lore import TabularRandomGeneratorLore

# Build the LORE explainer from the black-box wrapper and the dataset.
tabularLore = TabularRandomGeneratorLore(bbox, dataset)

In [56]:
# Explain the black-box prediction for a single row of the dataset.
num_row = 10
# The instance is the row without its last column (the target feature).
x = dataset.df.iloc[num_row].iloc[:-1]
explanation = tabularLore.explain(x)
print(explanation)

{'rule': {'premises': [{'attr': 'weight', 'val': 1454706.125, 'op': '>'}, {'attr': 'power', 'val': 10381.05224609375, 'op': '<='}, {'attr': 'power', 'val': 5358.968505859375, 'op': '>'}, {'attr': 'month', 'val': 103.55714797973633, 'op': '>'}, {'attr': 'engine_age', 'val': 110.3382568359375, 'op': '<='}, {'attr': 'surf_temp', 'val': 291.49920654296875, 'op': '<='}, {'attr': 'surf_temp', 'val': 271.726318359375, 'op': '>'}, {'attr': 'y_month', 'val': 10.520062446594238, 'op': '<='}, {'attr': 'year', 'val': 4.852416276931763, 'op': '>'}], 'consequence': {'attr': 'value', 'val': 'Below Average', 'op': '='}}, 'counterfactuals': [{'premises': [{'attr': 'weight', 'val': 1454706.125, 'op': '<='}, {'attr': 'y_month', 'val': 8.398653030395508, 'op': '>'}, {'attr': 'power', 'val': 9480.85546875, 'op': '<='}, {'attr': 'power', 'val': 2346.666748046875, 'op': '>'}, {'attr': 'length', 'val': 65.54714584350586, 'op': '<='}, {'attr': 'year', 'val': 16.62959909439087, 'op': '<='}], 'consequence': {'at

In [57]:
from lore_sa.encoder_decoder import ColumnTransformerEnc

# Build the encoder/decoder from the dataset descriptor.
tabular_enc = ColumnTransformerEnc(dataset.descriptor)
# First row as a plain array, without its last element (the target).
ref_value = dataset.df.iloc[0].values[:-1]
# Round-trip the row through encode/decode; encode is given a one-row batch.
encoded = tabular_enc.encode([ref_value])
decoded = tabular_enc.decode(encoded)

# Print the three versions for a visual comparison.
print(f"Original value: {ref_value}")
print(f"Encoded value: {encoded}")
print(f"Decoded value: {decoded}")

Original value: [10.0 10.5 367.0 44 5.0 8 3.0 286.9]
Encoded value: [[10.0 10.5 367.0 44 5.0 8 3.0 286.9]]
Decoded value: [[10.0 10.5 367.0 44 5.0 8 3.0 286.9]]


In [58]:
from lore_sa.neighgen import RandomGenerator

# Instance to perturb: row `num_row` without its last column (the class feature).
num_row = 10
x = dataset.df.iloc[num_row].iloc[:-1]
# encode() is given a one-row batch; keep the single encoded row.
z = tabular_enc.encode([x.values])[0]

# Random neighbourhood generator around z (100 is presumably the number of
# samples requested — confirm against RandomGenerator.generate).
gen = RandomGenerator(bbox=bbox, dataset=dataset, encoder=tabular_enc, ocr=0.1)
neighbour = gen.generate(z, 100, dataset.descriptor, tabular_enc)

print('Neighborhood', neighbour)

Neighborhood [[35.0 10.62 131.0 143 1836109.99652243 11 12.116722118581054 278.45]
 [35.0 10.62 131.0 143 1836109.99652243 11 12.116722118581054 278.45]
 [35.0 10.62 8501.007977801397 143 1836109.99652243 11 12.116722118581054
  278.45]
 [35.0 10.62 8501.007977801397 187.22362508920688 443573.4175696151 11
  12.116722118581054 278.45]
 [35.0 10.62 8501.007977801397 187.22362508920688 443573.4175696151 11
  12.116722118581054 278.45]
 [35.0 10.62 8501.007977801397 187.22362508920688 443573.4175696151 11
  12.116722118581054 280.4440078949402]
 [35.0 10.62 8501.007977801397 187.22362508920688 443573.4175696151 11
  12.116722118581054 280.4440078949402]
 [35.0 49.00105493286231 8501.007977801397 187.22362508920688
  443573.4175696151 11 12.116722118581054 280.4440078949402]
 [35.0 49.00105493286231 8501.007977801397 187.22362508920688
  1442435.6362703692 11 12.116722118581054 280.4440078949402]
 [35.0 49.00105493286231 8501.007977801397 187.22362508920688
  1442435.6362703692 11 12.11672

In [59]:
from lore_sa.surrogate import DecisionTreeSurrogate
# Decode the neighbourhood so the black-box model can label it.
neighb_train_X = tabular_enc.decode(neighbour)
neighb_train_y = bbox.predict(neighb_train_X)
# Encode the predicted classes for the surrogate model; the labels are passed
# as a single column and the result is squeezed.
neighb_train_yz = tabular_enc.encode_target_class(neighb_train_y.reshape(-1, 1)).squeeze()

dt = DecisionTreeSurrogate()
# NOTE(review): this rebinds `x` (previously the instance row) to whatever
# dt.train returns; the next cell reassigns `x` before using it.
x = dt.train(neighbour, neighb_train_yz)

In [60]:
# Re-create the encoded instance: row `num_row` without its last column (the class feature).
num_row = 10
x = dataset.df.iloc[num_row].iloc[:-1]
z = tabular_enc.encode([x.values])[0]

# Factual rule extracted from the surrogate tree for this instance.
rule = dt.get_rule(z, tabular_enc)
print('rule', rule)

# Counterfactual rules from the same neighbourhood and its encoded labels.
crules, deltas = dt.get_counterfactual_rules(z, neighbour, neighb_train_yz, tabular_enc)
print('\n crules')
for c in crules:
    print(c)


rule premises:
power <= 2356.8087158203125 
consequence: value = Poor Session

 crules
premises:
power > 2356.8087158203125
weight > 1227886.53125
month > 63.99588394165039
year > 8.63936471939087 
consequence: value = Below Average


In [61]:
# Inspect the surrogate's tree; the cell output shows an sklearn.tree._tree.Tree object.
dt.get_tree_structure()

<sklearn.tree._tree.Tree at 0x1b6951b4ab0>