The aim of this notebook will be to find the following data:
* decision tree to plot in the visualization
* the neighbourhood generated by LORE

Everything will be tested on a 5,000-sample subset of the MNIST dataset to keep running times short

### dataset import

In [54]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import json
from sklearn.preprocessing import LabelEncoder

In [55]:
from sklearn.datasets import fetch_openml

In [56]:
# Load MNIST from OpenML. as_frame=False is requested so that X and y come back
# as arrays; the code below slices them and reads X.shape.
print("Loading MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False, parser='auto')
print("Dataset loaded")

# Keep only the first n_samples rows (no shuffling) so the rest of the
# notebook runs quickly.
n_samples = 5000
X = X[:n_samples]
y = y[:n_samples]

# One synthetic column name per pixel: pixel_0 ... pixel_{n_features - 1}
feature_names = [f'pixel_{i}' for i in range(X.shape[1])]

Loading MNIST dataset...
Dataset loaded


### LORE initial

read data

In [57]:
from lore_sa.dataset import TabularDataset
import pandas as pd

In [58]:
# Build the column-oriented dict handed to TabularDataset.from_dict:
# one entry per pixel column, with the label column added last.
data_dict = {name: X[:, i] for i, name in enumerate(feature_names)}
target_name = 'target'
data_dict[target_name] = y

# Create the TabularDataset. target_name is passed instead of repeating the
# literal so the label column is named in exactly one place.
dataset = TabularDataset.from_dict(data_dict, target_name)
# Drop incomplete rows in place.
# NOTE(review): dataset.descriptor is not recomputed in this cell - confirm
# whether it has to be refreshed when rows are actually dropped.
dataset.df.dropna(inplace=True)

train model

In [59]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from lore_sa.bbox import sklearn_classifier_bbox

In [60]:
def train_model_generalized(dataset: TabularDataset, target_name: str):
    """
    Train a random-forest black box on ``dataset`` and wrap it for LORE.

    Every column of ``dataset.df`` except ``target_name`` is used as a numeric
    feature and standardised before reaching the forest. The feature columns
    are derived from the dataset itself, so the function no longer depends on
    the notebook-level ``feature_names`` list.

    Parameters:
    -----------
    dataset : TabularDataset
        Dataset whose ``df`` holds the feature columns and the target column.
    target_name : str
        Name of the target column in ``dataset.df``.

    Returns:
    --------
    sklearnBBox
        The fitted scaler + random-forest pipeline wrapped in
        ``sklearn_classifier_bbox.sklearnBBox``.

    Notes:
    ------
    Side effect (kept from the original implementation): ``dataset.df`` is
    replaced by a copy without the classes that occur only once.
    """
    # A stratified split cannot place a class with a single sample on both
    # sides, so those classes are removed first.
    class_counts = dataset.df[target_name].value_counts()
    valid_classes = class_counts[class_counts > 1].index
    dataset.df = dataset.df[dataset.df[target_name].isin(valid_classes)]

    # Select features by name rather than by position.
    features = dataset.df.drop(columns=[target_name])
    labels = dataset.df[target_name]

    # Only the training part is used; the held-out 30% is discarded here.
    X_train, _, y_train, _ = train_test_split(
        features, labels, test_size=0.3, random_state=42, stratify=labels
    )

    # Create and train the model
    model = make_pipeline(
        StandardScaler(),
        RandomForestClassifier(n_estimators=100, random_state=42),
    )
    model.fit(X_train, y_train)

    return sklearn_classifier_bbox.sklearnBBox(model)

In [61]:
# Train the black-box classifier. This call also replaces dataset.df with a
# copy that excludes classes having a single sample.
bbox = train_model_generalized(dataset, target_name)

In [62]:
# Inspect the per-feature summary statistics stored on the dataset.
dataset.descriptor

{'numeric': {'pixel_0': {'index': 0,
   'min': 0,
   'max': 0,
   'mean': 0.0,
   'std': 0.0,
   'median': 0.0,
   'q1': 0.0,
   'q3': 0.0},
  'pixel_1': {'index': 1,
   'min': 0,
   'max': 0,
   'mean': 0.0,
   'std': 0.0,
   'median': 0.0,
   'q1': 0.0,
   'q3': 0.0},
  'pixel_2': {'index': 2,
   'min': 0,
   'max': 0,
   'mean': 0.0,
   'std': 0.0,
   'median': 0.0,
   'q1': 0.0,
   'q3': 0.0},
  'pixel_3': {'index': 3,
   'min': 0,
   'max': 0,
   'mean': 0.0,
   'std': 0.0,
   'median': 0.0,
   'q1': 0.0,
   'q3': 0.0},
  'pixel_4': {'index': 4,
   'min': 0,
   'max': 0,
   'mean': 0.0,
   'std': 0.0,
   'median': 0.0,
   'q1': 0.0,
   'q3': 0.0},
  'pixel_5': {'index': 5,
   'min': 0,
   'max': 0,
   'mean': 0.0,
   'std': 0.0,
   'median': 0.0,
   'q1': 0.0,
   'q3': 0.0},
  'pixel_6': {'index': 6,
   'min': 0,
   'max': 0,
   'mean': 0.0,
   'std': 0.0,
   'median': 0.0,
   'q1': 0.0,
   'q3': 0.0},
  'pixel_7': {'index': 7,
   'min': 0,
   'max': 0,
   'mean': 0.0,
   'std': 0

### encoding decoding

In [63]:
from lore_sa.encoder_decoder import ColumnTransformerEnc

# Build the encoder from the dataset descriptor and round-trip the first row
# through encode/decode.
tabular_enc = ColumnTransformerEnc(dataset.descriptor)
# [:-1] drops the last column value; this assumes the target is the last
# column of dataset.df (it was the last key added to data_dict).
ref_value = dataset.df.iloc[0].values[:-1]
# encode() is called with a one-element batch.
encoded = tabular_enc.encode([ref_value])
decoded = tabular_enc.decode(encoded)

print(f"Original value: {ref_value}")
print(f"Encoded value: {encoded}")
print(f"Decoded value: {decoded}")

Original value: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 3 18 18 18 126 136 175 26 166 255 247 127 0 0 0 0 0 0 0 0 0 0 0 0
 30 36 94 154 170 253 253 253 253 253 225 172 253 242 195 64 0 0 0 0 0 0 0
 0 0 0 0 49 238 253 253 253 253 253 253 253 253 251 93 82 82 56 39 0 0 0 0
 0 0 0 0 0 0 0 0 18 219 253 253 253 253 253 198 182 247 241 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 80 156 107 253 253 205 11 0 43 154 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 14 1 154 253 90 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 139 253 190 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 11 190 253 70 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 35 241
 225 160 108 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 81 240 253
 253 119 25 0 0

the value to use for the prediction can be found via:

In [64]:
# Feature values of the first row, with the trailing target entry removed.
ref_value = dataset.df.iloc[0].values[:-1]
# Displayed as a plain list; this prints every one of the feature values.
list(ref_value)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 18,
 18,
 18,
 126,
 136,
 175,
 26,
 166,
 255,
 247,
 127,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 30,
 36,
 94,
 154,
 170,
 253,
 253,
 253,
 253,
 253,
 225,
 172,
 253,
 242,
 195,
 64,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 49,
 238,
 253,
 253,
 253,
 253,
 253,
 253,
 253,
 253,
 251,
 93,
 82,
 82,
 56,
 39,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 18,

and decoded using

In [65]:
# Encode the reference row as a one-element batch, decode it again, and show
# the single decoded row as a plain list.
encoded = tabular_enc.encode([ref_value])
decoded = tabular_enc.decode(encoded)
list(decoded[0])

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 18,
 18,
 18,
 126,
 136,
 175,
 26,
 166,
 255,
 247,
 127,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 30,
 36,
 94,
 154,
 170,
 253,
 253,
 253,
 253,
 253,
 225,
 172,
 253,
 242,
 195,
 64,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 49,
 238,
 253,
 253,
 253,
 253,
 253,
 253,
 253,
 253,
 251,
 93,
 82,
 82,
 56,
 39,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 18,

### neighborhood generation

In [66]:
from lore_sa.neighgen import RandomGenerator

select the instance to explain

In [67]:
# Row position of the instance to explain.
num_row = 0
# [:-1] removes the last entry of the row; this assumes the target is the
# last column of dataset.df, leaving only the feature values.
x = dataset.df.iloc[num_row][:-1]
x

pixel_0      0
pixel_1      0
pixel_2      0
pixel_3      0
pixel_4      0
            ..
pixel_779    0
pixel_780    0
pixel_781    0
pixel_782    0
pixel_783    0
Name: 0, Length: 784, dtype: object

encode it

In [68]:
# Encode the instance. The target entry was already removed when `x` was
# built, so encode() receives a one-element batch of feature values and the
# single encoded row is taken back out with [0].
z = tabular_enc.encode([x.values])[0]

creates the neighborhood

In [69]:
# Build the random neighbourhood generator around the black box and encoder.
gen = RandomGenerator(bbox=bbox, dataset=dataset, encoder=tabular_enc, ocr=0.1)
# 100 is passed as the second positional argument - presumably the number of
# synthetic instances to generate; confirm against RandomGenerator.generate.
# NOTE(review): no seed is set in this cell, so the neighbourhood may differ
# between runs - confirm whether the generator can be seeded.
neighbour = gen.generate(z, 100, dataset.descriptor, tabular_enc)
neighbour

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0]], dtype=object)

### surrogate model

In [70]:
from lore_sa.surrogate import DecisionTreeSurrogate

create X, y and yz for the decision tree surrogate

In [71]:
# decode the neighborhood to be labeled by the blackbox model
neighb_train_X = tabular_enc.decode(neighbour)
neighb_train_y = bbox.predict(neighb_train_X)
# encode the target class to the surrogate model
neighb_train_yz = tabular_enc.encode_target_class(neighb_train_y.reshape(-1, 1)).squeeze()

train the surrogate model

In [72]:
# Fit the surrogate decision tree on the encoded neighbourhood and its encoded
# black-box labels.
dt = DecisionTreeSurrogate()
# The return value is deliberately not bound to `x`: that name holds the
# instance being explained, and overwriting it breaks any re-run of the
# encoding cell. The trailing semicolon suppresses the cell output.
dt.train(neighbour, neighb_train_yz);

### data extraction for the plots

#### decision tree

get the decision tree's `tree_` attribute for plotting

In [73]:
# Low-level tree structure of the fitted surrogate; extract_tree_structure
# reads its node arrays.
dt.get_dt().tree_

<sklearn.tree._tree.Tree at 0x139d43faea0>

In [74]:
from sklearn.tree import DecisionTreeClassifier
from dataclasses import dataclass
from typing import List, Optional
from dataclasses import asdict

@dataclass
class TreeNode:
    """
    Flat, serialisable description of a single decision tree node.

    Attributes:
    -----------
    node_id : int
        Unique identifier of the node inside the tree.
    feature_name : Optional[str]
        Feature tested at this node; ``None`` for a leaf.
    threshold : Optional[float]
        Split threshold applied to ``feature_name``; ``None`` for a leaf.
    left_child : Optional[int]
        Node id of the left child; ``None`` for a leaf.
    right_child : Optional[int]
        Node id of the right child; ``None`` for a leaf.
    is_leaf : bool
        ``True`` for a leaf node, ``False`` for an internal node.
    class_label : Optional[str]
        Class predicted by a leaf; ``None`` for an internal node.
    samples : int
        Number of training samples that reached this node.
    """
    node_id: int
    feature_name: Optional[str]
    threshold: Optional[float]
    left_child: Optional[int]
    right_child: Optional[int]
    is_leaf: bool
    class_label: Optional[str]
    samples: int

def _leaf_class_label(tree_classifier, class_position: int, target_names: List[str]) -> str:
    """Translate an argmax position in ``tree_.value`` into a class label."""
    classes = getattr(tree_classifier, "classes_", None)
    if classes is None:
        # No class table available: fall back to using the position directly.
        return target_names[class_position]

    class_value = classes[class_position]
    if isinstance(class_value, str):
        # The tree was fitted on raw string labels; they are already names.
        return str(class_value)

    try:
        encoded = int(class_value)
    except (TypeError, ValueError):
        return str(class_value)

    # NOTE(review): assumes an integer class value is an index into
    # target_names - confirm against the encoder that produced the labels.
    if encoded == class_value and 0 <= encoded < len(target_names):
        return target_names[encoded]
    return str(class_value)


def extract_tree_structure(tree_classifier: DecisionTreeClassifier, feature_names: List[str], target_names: List[str]) -> List[TreeNode]: 
    """
    Extract node information from a trained DecisionTreeClassifier

    The majority class of a leaf is found as an argmax over ``tree_.value``.
    That argmax is a position in ``tree_classifier.classes_`` (the classes
    actually seen during fitting), not a class id. It is therefore mapped
    through ``classes_`` before ``target_names`` is consulted; otherwise a
    tree fitted on a subset of the classes gets wrong leaf labels.

    Parameters:
    -----------
    tree_classifier : DecisionTreeClassifier
        A trained sklearn DecisionTreeClassifier
        (assumed single-output - TODO confirm for multi-output trees)
    feature_names : List[str]
        A list of feature names, indexed by the feature ids stored in the tree
    target_names : List[str]
        A list of target class labels, indexed by class value

    Returns:
    --------
    List[TreeNode]
        List of TreeNode objects containing the tree structure, in node-id order
    """
    tree = tree_classifier.tree_

    nodes = []

    for node_id in range(tree.node_count):
        samples = int(tree.n_node_samples[node_id])

        # A left-child id of -1 marks a leaf.
        if tree.children_left[node_id] == -1:
            class_position = int(tree.value[node_id].argmax())
            nodes.append(TreeNode(
                node_id=node_id,
                feature_name=None,
                threshold=None,
                left_child=None,
                right_child=None,
                is_leaf=True,
                class_label=_leaf_class_label(tree_classifier, class_position, target_names),
                samples=samples
            ))
            continue

        nodes.append(TreeNode(
            node_id=node_id,
            feature_name=feature_names[int(tree.feature[node_id])],
            threshold=float(tree.threshold[node_id]),
            left_child=int(tree.children_left[node_id]),
            right_child=int(tree.children_right[node_id]),
            is_leaf=False,
            class_label=None,
            samples=samples
        ))

    return nodes

def generate_decision_tree_visualization_data(nodes):
    """
    Convert tree nodes into plain dictionaries ready for JSON serialisation.

    Nothing is written to disk here; the caller decides where the data goes.

    Parameters:
    -----------
    nodes : List[TreeNode]
        List of TreeNode objects to convert

    Returns:
    --------
    list
        One dictionary per node, in the same order as ``nodes``
    """
    return list(map(asdict, nodes))

In [75]:
# Show the descriptor sections and the list of target class values that is
# later used as class names.
dataset.descriptor.keys(), dataset.descriptor["target"]["target"]['distinct_values']

(dict_keys(['numeric', 'categorical', 'ordinal', 'target']),
 ['5', '0', '4', '1', '9', '2', '3', '6', '7', '8'])

In [76]:
# Serialise the surrogate tree to loreTreeTest.json.
# Feature names come from the descriptor's "numeric" section and class names
# from the target's distinct values.
with open ("loreTreeTest.json", "w") as f:
    json.dump(generate_decision_tree_visualization_data (
        nodes= extract_tree_structure(
            tree_classifier=dt.get_dt(),
            feature_names=list(dataset.descriptor["numeric"].keys()),
            target_names=list(dataset.descriptor["target"]["target"]['distinct_values'])
        )
    ), f, indent=4)

#### PCA

In [77]:
# Display the decision tree object held by the surrogate.
dt.get_dt()

In [82]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.spatial import Voronoi
from shapely.geometry import Polygon
from shapely.ops import unary_union
import networkx as nx
from sklearn.cluster import KMeans

def preprocess_data(X):
    """
    Standardise the features and project them onto two principal components.

    Parameters:
    -----------
    X : array-like
        Input features, one row per sample

    Returns:
    --------
    tuple
        (transformed_data, pca_model, scaler_model): the 2-D projection plus
        the fitted PCA and StandardScaler, so callers can invert the mapping
    """
    scaler_model = StandardScaler()
    standardized = scaler_model.fit_transform(X)

    pca_model = PCA(n_components=2)
    projected = pca_model.fit_transform(standardized)

    return projected, pca_model, scaler_model

def generate_decision_boundary_grid(X_pca, step=0.1):
    """
    Build a regular grid covering the projected points with a one-unit margin.

    Parameters:
    -----------
    X_pca : array-like
        PCA transformed features; column 0 is the x axis, column 1 the y axis
    step : float, default=0.1
        Spacing between neighbouring grid points on both axes

    Returns:
    --------
    tuple
        (xx, yy, bounds) where xx and yy are meshgrid arrays and bounds is
        (x_min, x_max, y_min, y_max)
    """
    first_pc, second_pc = X_pca[:, 0], X_pca[:, 1]

    # Pad the data range by one unit on every side.
    x_min = first_pc.min() - 1
    x_max = first_pc.max() + 1
    y_min = second_pc.min() - 1
    y_max = second_pc.max() + 1

    x_ticks = np.arange(x_min, x_max, step)
    y_ticks = np.arange(y_min, y_max, step)
    xx, yy = np.meshgrid(x_ticks, y_ticks)

    bounds = (x_min, x_max, y_min, y_max)
    return xx, yy, bounds

def filter_points_by_class_kmeans(points, original_data, labels, threshold=500, threshold_multiplier=4, random_state=42):
    """
    Filter points using K-means clustering to reduce data density, while preserving the original order.

    Classes with at most ``threshold`` points are kept whole. For a larger
    class, a random sample is clustered into ``threshold`` clusters and, for
    every centroid, the closest point of that class is kept. The selection is
    tracked as indices into the input, so the survivors come back in their
    original relative order.

    Changes from the previous version:
    * a point chosen by several centroids is returned once, not repeated;
    * an empty selection returns empty arrays instead of raising, because the
      index array is always built with an integer dtype;
    * ``labels`` and ``points`` may be plain sequences.

    Parameters:
    -----------
    points : array-like
        Input points to filter
    original_data : array-like
        Original feature data corresponding to points (indexed row-wise)
    labels : array-like
        Class labels for points
    threshold : int, default=500
        Maximum number of points per class
    threshold_multiplier : int, default=4
        Multiplier for initial sampling size
    random_state : int, default=42
        Random state for reproducibility

    Returns:
    --------
    tuple
        (filtered_points, filtered_original_data, filtered_labels)
    """
    points = np.asarray(points)
    labels = np.asarray(labels)

    selected_indices = []

    for cls in np.unique(labels):
        # Positions of this class in the original ordering.
        class_indices = np.where(labels == cls)[0]
        n_points = len(class_indices)

        if n_points <= threshold:
            selected_indices.extend(class_indices.tolist())
            continue

        class_points = points[class_indices]

        # Cluster a random subset only, to bound the cost of K-means.
        sample_size = min(threshold * threshold_multiplier, n_points)
        rng = np.random.RandomState(random_state)
        sampled_local = rng.choice(n_points, size=sample_size, replace=False)

        kmeans = KMeans(n_clusters=threshold, random_state=random_state, n_init=10)
        kmeans.fit(class_points[sampled_local])

        # For each centroid keep the real point closest to it (searched over
        # the whole class, not only the sample) and map it back to its
        # position in the input.
        for centroid in kmeans.cluster_centers_:
            distances = np.linalg.norm(class_points - centroid, axis=1)
            selected_indices.append(class_indices[np.argmin(distances)])

    # np.unique sorts (restoring the original order) and removes duplicates.
    # The explicit integer dtype keeps an empty selection usable as an index.
    selected_indices = np.unique(np.asarray(selected_indices, dtype=np.intp))

    filtered_points = points[selected_indices]
    filtered_original = original_data[selected_indices]
    filtered_labels = labels[selected_indices]
    return filtered_points, filtered_original, filtered_labels

def create_voronoi_regions(xx, yy, Z, class_names):
    """
    Create Voronoi regions for decision boundaries.

    Every grid point gets a Voronoi cell. Bounded cells are labelled with the
    class predicted at their grid point, and neighbouring cells of the same
    class are merged into one geometry per connected patch.

    Only cells that received a polygon take part in the merge. Previously two
    neighbouring cells that were both skipped compared as ``None == None`` and
    were added to the graph, which made the merge step fail with a KeyError.

    Parameters:
    -----------
    xx : array-like
        X coordinates of the grid
    yy : array-like
        Y coordinates of the grid
    Z : array-like
        Predicted classes for the grid points, same shape as the grid;
        values are cast to ``int`` before indexing ``class_names``
    class_names : list
        List of class names corresponding to numeric indices

    Returns:
    --------
    tuple
        (merged_regions, merged_classes) where merged_classes contains actual class names
    """
    vor = Voronoi(np.c_[xx.ravel(), yy.ravel()])
    regions, vertices = vor.regions, vor.vertices
    flat_classes = Z.ravel()

    G = nx.Graph()
    region_class_map = {}   # region index -> class name
    region_polygons = {}    # region index -> shapely Polygon

    for point_index, region_index in enumerate(vor.point_region):
        region = regions[region_index]
        # Skip empty regions and unbounded ones (marked by a -1 vertex).
        if len(region) == 0 or -1 in region:
            continue
        region_polygons[region_index] = Polygon([vertices[i] for i in region])
        # int() lets float-typed predictions (e.g. 3.0) index the list.
        region_class_map[region_index] = class_names[int(flat_classes[point_index])]
        G.add_node(region_index)

    # Link adjacent regions that carry the same class.
    for (p1, p2), ridge_vertices in zip(vor.ridge_points, vor.ridge_vertices):
        if -1 in ridge_vertices:
            continue
        r1, r2 = vor.point_region[p1], vor.point_region[p2]
        # Regions without a polygon must never enter the graph.
        if r1 not in region_class_map or r2 not in region_class_map:
            continue
        if region_class_map[r1] == region_class_map[r2]:
            G.add_edge(r1, r2)

    # Merge connected regions
    merged_regions = []
    merged_classes = []

    for component in nx.connected_components(G):
        members = list(component)
        merged_regions.append(unary_union([region_polygons[i] for i in members]))
        # All members share one class, so any of them can supply it.
        merged_classes.append(region_class_map[members[0]])

    return merged_regions, merged_classes

def format_pc_label(pc_loadings, feature_names, pc_index):
    """
    Build an axis label listing each feature's loading on one principal component.

    Parameters:
    -----------
    pc_loadings : array-like
        Principal component loadings, one per feature
    feature_names : list
        List of feature names, paired position-wise with the loadings
    pc_index : int
        Zero-based index of the principal component (shown one-based)

    Returns:
    --------
    str
        Label of the form "PC1: name (+0.12), other (-0.34)"
    """
    contributions = []
    for name, value in zip(feature_names, pc_loadings):
        contributions.append(f"{name} ({value:+.2f})")

    prefix = f"PC{pc_index + 1}: "
    return prefix + ", ".join(contributions)

def generate_pca_visualization_data(feature_names, class_names, X, y, pretrained_tree, step=0.1):
    """
    Assemble the PCA scatter data and decision regions for a pre-trained tree.

    The data is standardised and projected to two components. A grid laid over
    the projection is mapped back to the original feature space and classified
    by ``pretrained_tree``, and the resulting class map is turned into merged
    polygons.

    Parameters:
    -----------
    feature_names : list
        List of feature names, one per column of ``X``
    class_names : list
        Class names indexed by the values the tree predicts
    X : array-like
        Input features
    y : array-like
        Target labels, aligned with the rows of ``X``
    pretrained_tree : DecisionTreeClassifier
        Pre-trained decision tree classifier on original (non-PCA) data
    step : float, default=0.1
        Step size for decision boundary grid

    Returns:
    --------
    dict
        Visualization data with keys "pcaData", "originalData", "targets",
        "decisionBoundary", "xAxisLabel" and "yAxisLabel"
    """
    # Project the data and keep the fitted models for the inverse mapping.
    X_pca, pca, scaler = preprocess_data(X)

    # Lay a grid over the projected points.
    xx, yy, bounds = generate_decision_boundary_grid(X_pca, step)
    x_min, x_max, y_min, y_max = bounds

    # Undo PCA first, then the scaling, so the tree sees original-space values.
    grid_in_pca_space = np.c_[xx.ravel(), yy.ravel()]
    grid_in_feature_space = scaler.inverse_transform(
        pca.inverse_transform(grid_in_pca_space)
    )
    Z = pretrained_tree.predict(grid_in_feature_space).reshape(xx.shape)

    # Thin out dense classes before shipping the points.
    kept_pca, kept_original, kept_labels = filter_points_by_class_kmeans(
        X_pca, X, y, threshold=2000, threshold_multiplier=5
    )

    # Merge same-class grid cells into regions labelled with class names.
    merged_regions, merged_classes = create_voronoi_regions(xx, yy, Z, class_names)

    axis_labels = [
        format_pc_label(pca.components_[component], feature_names, component)
        for component in (0, 1)
    ]

    # One {feature name: value} mapping per retained row.
    original_records = []
    for row in kept_original:
        original_records.append(pd.Series(row, index=feature_names).to_dict())

    region_outlines = [list(region.exterior.coords) for region in merged_regions]

    return {
        "pcaData": kept_pca.tolist(),
        "originalData": original_records,
        "targets": kept_labels.tolist(),
        "decisionBoundary": {
            "regions": region_outlines,
            "regionClasses": merged_classes,
            "xRange": [float(x_min), float(x_max)],
            "yRange": [float(y_min), float(y_max)],
        },
        "xAxisLabel": axis_labels[0],
        "yAxisLabel": axis_labels[1],
    }


In [85]:

# Write the PCA scatter data and decision regions to lorePCATest.json.
# X and y are the decoded neighbourhood and the black-box labels computed for it.
with open ("lorePCATest.json", "w") as f:
    json.dump(generate_pca_visualization_data (
        feature_names=feature_names, 
        class_names=dataset.descriptor["target"]["target"]['distinct_values'], 
        X=neighb_train_X, 
        y=neighb_train_y, 
        pretrained_tree=dt.get_dt(), 
        step = 0.1
    ), f, indent=4)