In [1]:
from lore_sa.dataset import TabularDataset
from lore_sa.lore import TabularGeneticGeneratorLore
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from lore_sa.bbox import sklearn_classifier_bbox
import os
import pickle
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

# Load the California housing data. The Bunch gets its own name so it is not
# confused with the TabularDataset that a later cell binds to `dataset`.
housing = fetch_california_housing()
feature_names = list(housing.feature_names)
target_names = [f'percentile_{i}' for i in range(5)]

prices = housing.target

# Create a DataFrame with the features; the class column is added below
df = pd.DataFrame(housing.data, columns=feature_names)

# Use a single qcut call to create 5 equal-sized bins (20% each):
# retbins=True returns the labelled classes together with the bin edges,
# so the prices do not have to be binned a second time just for display.
df['target'], bin_edges = pd.qcut(prices, q=5, labels=target_names, retbins=True)

# Show which price range each class label covers
print("\nPercentile Band Mapping (20% in each band):")
for band_name, lower, upper in zip(target_names, bin_edges[:-1], bin_edges[1:]):
    print(f"{band_name}: ${lower:.4f} - ${upper:.4f} (hundred thousands)")

# Save to CSV (read back through TabularDataset.from_csv in the next cell)
df.to_csv('california_housing_5_split.csv', index=False)


Percentile Band Mapping (20% in each band):
percentile_0: $0.1500 - $1.0720 (hundred thousands)
percentile_1: $1.0720 - $1.5730 (hundred thousands)
percentile_2: $1.5730 - $2.0940 (hundred thousands)
percentile_3: $2.0940 - $2.9000 (hundred thousands)
percentile_4: $2.9000 - $5.0000 (hundred thousands)


In [3]:
# Name of the class column; read as a module-level global by the functions defined below.
target = "target"

# Wrap the CSV written by the previous cell in a LORE TabularDataset.
dataset = TabularDataset.from_csv(
    "california_housing_5_split.csv",
    class_name=target,
)

# Drop incomplete rows directly on the frame held by the dataset object.
dataset.df.dropna(inplace=True)

In [4]:
# Column labels of the loaded frame (for a DataFrame, keys() is the same Index as .columns).
dataset.df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'target'],
      dtype='object')

In [5]:
# Descriptor entry for the class column: per the output below it holds the
# column index, the distinct class labels and the number of rows per label.
dataset.descriptor["target"]

{'target': {'index': 8,
  'distinct_values': ['percentile_4',
   'percentile_3',
   'percentile_2',
   'percentile_1',
   'percentile_0'],
  'count': {'percentile_4': 4125,
   'percentile_3': 4125,
   'percentile_2': 4132,
   'percentile_1': 4129,
   'percentile_0': 4129}}}

In [6]:
# Full dataset descriptor; the output below lists, for each numeric feature,
# its column index and summary statistics (min, max, mean, std, median, q1, q3).
dataset.descriptor

{'numeric': {'MedInc': {'index': 0,
   'min': 0.4999,
   'max': 15.0001,
   'mean': 3.8706710029069766,
   'std': 1.8998217179452688,
   'median': 3.5347999999999997,
   'q1': 2.5633999999999997,
   'q3': 4.74325},
  'HouseAge': {'index': 1,
   'min': 1.0,
   'max': 52.0,
   'mean': 28.639486434108527,
   'std': 12.58555761211165,
   'median': 29.0,
   'q1': 18.0,
   'q3': 37.0},
  'AveRooms': {'index': 2,
   'min': 0.8461538461538461,
   'max': 141.9090909090909,
   'mean': 5.428999742190376,
   'std': 2.4741731394243187,
   'median': 5.229128787878788,
   'q1': 4.440716235896959,
   'q3': 6.052380952380952},
  'AveBedrms': {'index': 3,
   'min': 0.3333333333333333,
   'max': 34.06666666666667,
   'mean': 1.096675149606208,
   'std': 0.4739108567954661,
   'median': 1.048780487804878,
   'q1': 1.006079046038478,
   'q3': 1.099526066350711},
  'Population': {'index': 4,
   'min': 3.0,
   'max': 35682.0,
   'mean': 1425.4767441860465,
   'std': 1132.462121765341,
   'median': 1166.0,
  

In [7]:
# Quick sanity check on the frame size and the columns that are present.
print("Dataset shape: {}".format(dataset.df.shape))
print("Features retained: {}".format(dataset.df.columns.tolist()))

Dataset shape: (20640, 9)
Features retained: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'target']


In [8]:
def create_preprocessor(dataset: TabularDataset):
    """Build the ColumnTransformer used in front of the classifier.

    Numeric columns are standardised and categorical columns are ordinal
    encoded, with categories unseen during fitting mapped to ``-1``. Column
    positions are taken from the ``index`` entry of every feature listed under
    ``dataset.descriptor['numeric']`` and ``dataset.descriptor['categorical']``.

    NOTE(review): the positions are applied to the feature matrix after the
    class column has been dropped by the callers — presumably this relies on
    the class column being the last one; confirm for other datasets.
    """
    descriptor = dataset.descriptor

    def column_positions(kind):
        # Positional indices of all features recorded under the given descriptor section.
        return [info['index'] for info in descriptor[kind].values()]

    scaling_step = ('num', StandardScaler(), column_positions('numeric'))
    encoding_step = (
        'cat',
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        column_positions('categorical'),
    )

    return ColumnTransformer(transformers=[scaling_step, encoding_step])

In [9]:
def find_best_hyperparameters(dataset: TabularDataset, sample_size=0.1, cv_folds=5, random_state=42):
    """Grid-search RandomForest hyperparameters on a stratified subsample.

    Args:
        dataset: TabularDataset whose ``df`` holds the feature columns plus the
            class column named by the module-level ``target`` variable.
        sample_size: Passed to ``train_test_split`` as ``train_size``; it
            controls how much of the data is used for the search.
        cv_folds: Number of cross-validation folds used by ``GridSearchCV``.
        random_state: Seed for the subsample split and for the forest.

    Returns:
        dict: ``best_params_`` of the fitted grid search. Keys carry the
        ``randomforestclassifier__`` pipeline-step prefix.
    """
    # Prepare full data
    X = dataset.df.drop([target], axis=1)
    y = dataset.df[target].values

    # Take a stratified sample for the hyperparameter search
    X_sample, _, y_sample, _ = train_test_split(
        X.values, y,
        train_size=sample_size,
        random_state=random_state,
        stratify=y
    )

    # Report the share that was actually sampled instead of a hard-coded "10%":
    # this stays correct for any sample_size, including an absolute row count.
    sample_pct = 100 * len(X_sample) / len(X)

    # Create preprocessor
    preprocessor = create_preprocessor(dataset)

    # Define parameter grid for RandomForestClassifier
    # (keys are prefixed with the step name that make_pipeline assigns)
    param_grid = {
        'randomforestclassifier__n_estimators': [50, 100, 200, 300],
        'randomforestclassifier__max_depth': [10, 20, 30, None],
        'randomforestclassifier__min_samples_split': [2, 5, 10],
        'randomforestclassifier__min_samples_leaf': [1, 2, 4],
        'randomforestclassifier__max_features': ['sqrt', 'log2'],
        'randomforestclassifier__bootstrap': [True, False]
    }

    # Create base pipeline
    base_pipeline = make_pipeline(
        preprocessor,
        RandomForestClassifier(random_state=random_state, n_jobs=-1)
    )

    print(f"GRID SEARCH FOR HYPERPARAMETER OPTIMIZATION (on {sample_pct:g}% sample)")
    print(f"Sample size: {len(X_sample)} instances ({sample_pct:.1f}% of total)")
    print(f"Total combinations to evaluate: {np.prod([len(v) for v in param_grid.values()])}")
    print(f"Cross-validation folds: {cv_folds}")
    print("Scoring metric: accuracy\n")

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(
        estimator=base_pipeline,
        param_grid=param_grid,
        cv=cv_folds,
        scoring='accuracy',
        n_jobs=-1,
        verbose=4,
        return_train_score=True
    )

    print("Starting grid search on sample...")
    grid_search.fit(X_sample, y_sample)
    print("\nGrid search completed!\n")

    print("BEST HYPERPARAMETERS FOUND:")
    for param, value in grid_search.best_params_.items():
        print(f"  {param}: {value}")
    print(f"\nBest CV Score on sample: {grid_search.best_score_:.4f}")

    return grid_search.best_params_

In [10]:
def train_final_model(dataset: TabularDataset, best_params, test_size=0.3, cv_folds=5, random_state=42):
    """Fit the tuned RandomForest pipeline on a train split and evaluate it.

    The data is split into stratified train/test parts; the pipeline is fitted
    on the train part, scored on the held-out test part and additionally
    cross-validated on the train part.

    Args:
        dataset: TabularDataset whose ``df`` holds the feature columns plus the
            class column named by the module-level ``target`` variable.
        best_params: Grid-search parameters; a ``randomforestclassifier__``
            prefix on the keys is stripped before they are passed to the forest.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        cv_folds: Number of folds for the cross-validation on the train split.
        random_state: Seed for the split and for the forest.

    Returns:
        tuple: ``(bbox, results)`` where ``bbox`` is the fitted pipeline wrapped
        in ``sklearn_classifier_bbox.sklearnBBox`` and ``results`` is a dict
        with the parameters, CV mean/std, test metrics and the text report.
    """
    # Prepare data
    X = dataset.df.drop([target], axis=1)
    y = dataset.df[target].values

    X_train, X_test, y_train, y_test = train_test_split(
        X.values, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y  # Stratify on target
    )

    # Create preprocessor
    preprocessor = create_preprocessor(dataset)

    # Extract best parameters from grid search results
    rf_params = {k.replace('randomforestclassifier__', ''): v
                 for k, v in best_params.items()}

    # Create final pipeline with best parameters
    final_pipeline = make_pipeline(
        preprocessor,
        RandomForestClassifier(**rf_params, random_state=random_state, n_jobs=-1)
    )

    print("\nTRAINING FINAL MODEL ON FULL DATASET")
    print(f"Training set size: {len(X_train)} instances")
    print(f"Test set size: {len(X_test)} instances")
    print("Using best hyperparameters from grid search\n")

    final_pipeline.fit(X_train, y_train)

    # Evaluate on test set
    y_pred = final_pipeline.predict(X_test)

    # Calculate metrics
    test_accuracy = accuracy_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred, average='weighted')
    test_recall = recall_score(y_test, y_pred, average='weighted')
    test_f1 = f1_score(y_test, y_pred, average='weighted')

    # Cross-validation score on full training set
    cv_scores = cross_val_score(final_pipeline, X_train, y_train, cv=cv_folds, scoring='accuracy')

    # classification_report applies target_names positionally to the label
    # order it uses. The descriptor lists the classes in a different order than
    # the sorted labels, so the same explicit list is passed for both `labels`
    # and `target_names` to keep every row under its own class name.
    class_labels = sorted(dataset.descriptor["target"][target]["distinct_values"])
    report = classification_report(
        y_test, y_pred, labels=class_labels, target_names=class_labels
    )

    # Print results
    print("FINAL MODEL RESULTS")
    print(f"\nCross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print("\nTest Set Performance:")
    print(f"  Accuracy:  {test_accuracy:.4f}")
    print(f"  Precision: {test_precision:.4f}")
    print(f"  Recall:    {test_recall:.4f}")
    print(f"  F1-Score:  {test_f1:.4f}")

    print("\nClassification Report:")
    print(report)

    # Store results (the report is built once and reused here)
    results = {
        'best_params': best_params,
        'cv_mean_score': cv_scores.mean(),
        'cv_std_score': cv_scores.std(),
        'test_accuracy': test_accuracy,
        'test_precision': test_precision,
        'test_recall': test_recall,
        'test_f1': test_f1,
        'classification_report': report
    }

    # Convert to LORE-compatible bbox
    bbox = sklearn_classifier_bbox.sklearnBBox(final_pipeline)

    return bbox, results

In [11]:
# Train-once cache: the search and training only run when the pickle file is
# missing; otherwise the previously saved objects are reused.
# NOTE(review): nothing invalidates the cache — delete the file after changing
# the data, the parameter grid or the training code.
runFilePath = "DS User Case Model.pkl"
if not os.path.exists(runFilePath):
    # Step 1: Find best hyperparameters on 10% sample
    best_params = find_best_hyperparameters(
        dataset=dataset,
        sample_size=0.1,  # Use 10% of data for hyperparameter search
        cv_folds=10,
        random_state=42
    )
    
    # Step 2: Train final model on full dataset with best parameters
    bbox, results = train_final_model(
        dataset=dataset,
        best_params=best_params,
        test_size=0.3,
        cv_folds=10,
        random_state=42
    )
    
    # Persist the wrapped model together with its metrics and parameters
    with open(runFilePath, "wb") as f:
        pickle.dump((bbox, results, best_params), f)

# Always read the objects back from disk, also right after a fresh training run.
# SECURITY: pickle.load can execute arbitrary code; only load a file this
# notebook wrote itself, never one obtained from an untrusted source.
with open(runFilePath, "rb") as f:
    bbox, results, best_params = pickle.load(f)

In [12]:
# Display the hyperparameters that were unpickled in the previous cell.
best_params

{'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__max_depth': 30,
 'randomforestclassifier__max_features': 'sqrt',
 'randomforestclassifier__min_samples_leaf': 1,
 'randomforestclassifier__min_samples_split': 5,
 'randomforestclassifier__n_estimators': 200}

In [13]:
# Take the row at position 1 (features only) as the first instance to explain.
x = dataset.df.drop(columns=[target]).iloc[1]
print("\nInstance to explain:")
print(x)
# The black box is called with a 2-D array, so the single row is reshaped to (1, n_features).
print("Predicted class: {}".format(bbox.predict(x.to_numpy().reshape(1, -1))[0]))


Instance to explain:
MedInc           8.301400
HouseAge        21.000000
AveRooms         6.238137
AveBedrms        0.971880
Population    2401.000000
AveOccup         2.109842
Latitude        37.860000
Longitude     -122.220000
Name: 1, dtype: float64
Predicted class: percentile_4


In [14]:
# Build the LORE explainer from the wrapped black-box model and the dataset.
tabularLore = TabularGeneticGeneratorLore(bbox, dataset)
# Disabled examples: explain x with 500 and with 3000 instances.
# explanation_default = tabularLore.explain(x, num_instances=500)
# explanation_big = tabularLore.explain(x, num_instances=3000)

In [15]:
# tabularLore.interactive_explanation(x, inJupyter=False)

In [16]:
# Take the row at position 1000 (features only) as the second instance to explain.
x2 = dataset.df.drop(columns=[target]).iloc[1000]
print("\nInstance to explain:")
print(x2)
# The black box is called with a 2-D array, so the single row is reshaped to (1, n_features).
print("Predicted class: {}".format(bbox.predict(x2.to_numpy().reshape(1, -1))[0]))


Instance to explain:
MedInc           3.384100
HouseAge        29.000000
AveRooms         4.842031
AveBedrms        1.002821
Population    1919.000000
AveOccup         2.706629
Latitude        37.690000
Longitude     -121.760000
Name: 1000, dtype: float64
Predicted class: percentile_2


In [None]:
# Launch the LORE_sa explanation web app for x2. Per the captured output, this
# starts an API server on port 8000 and a client on port 8080, then opens the
# client in the default browser.
# NOTE(review): the cell has no execution count, so the call appears to block
# while the server is running — confirm before relying on "Run All".
tabularLore.interactive_explanation(x2, inJupyter=False)

INFO:     Started server process [37632]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


Launching LORE_sa explanation viz webapp
Starting API server on 0.0.0.0:8000
INFO:     127.0.0.1:49348 - "GET /api/get-datasets HTTP/1.1" 200 OK
API server is ready at http://localhost:8000
NPM version detected: 10.9.3
Dependencies already installed, skipping npm install
Application started successfully!
API: http://localhost:8000/docs#/
Client: http://localhost:8080
Opening http://localhost:8080 in your default browser...
Browser opened successfully!


INFO:     127.0.0.1:52953 - "GET /api/check-custom-data HTTP/1.1" 200 OK
{'MedInc': 3.3841, 'HouseAge': 29.0, 'AveRooms': 4.842031029619182, 'AveBedrms': 1.002820874471086, 'Population': 1919.0, 'AveOccup': 2.706629055007052, 'Latitude': 37.69, 'Longitude': -121.76}
INFO:     127.0.0.1:52953 - "POST /api/explain HTTP/1.1" 200 OK
INFO:     127.0.0.1:52953 - "GET /api/get-classes-colors?method=umap HTTP/1.1" 200 OK
INFO:     127.0.0.1:52953 - "OPTIONS /api/update-visualization HTTP/1.1" 200 OK
INFO:     127.0.0.1:52953 - "POST /api/update-visualization HTTP/1.1" 200 OK
INFO:     127.0.0.1:52953 - "GET /api/get-classes-colors?method=pca HTTP/1.1" 200 OK
INFO:     127.0.0.1:63738 - "POST /api/update-visualization HTTP/1.1" 200 OK
INFO:     127.0.0.1:63738 - "GET /api/get-classes-colors?method=umap HTTP/1.1" 200 OK
{'MedInc': 3.3841, 'HouseAge': 29.0, 'AveRooms': 4.842031029619182, 'AveBedrms': 1.002820874471086, 'Population': 1919.0, 'AveOccup': 2.706629055007052, 'Latitude': 37.69, 'Longi