In [1]:

from plantbrain_fastml.managers.regressor_manager import RegressorManager
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import pandas as pd
import time
# Load California housing dataset as DataFrame for compatibility
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Initialize manager and add models (if not already added)
manager = RegressorManager()
# (If RegressorManager already adds models in __init__, no need to add explicitly)
# Otherwise, you can add explicitly:
# manager.add_model("linear_regression", LinearRegressionRegressor())
# manager.add_model("random_forest", RandomForestRegressorWrapper())

# Evaluate all models on training data with hypertuning and feature elimination
time_start = time.time()
results = manager.evaluate_all(
    X_train, y_train,
    hypertune=False,
    hypertune_params={'n_trials': 2},
    n_jobs=6,  # Set to -1 for all cores
    cv_folds=5,
    test_size=0.1,
    feature_elimination=True,
    fe_n_features=5,
    fe_method='lasso',
    return_plots=True  # set True if you want plots
)

print("Evaluation Results (CV + Test):")
print(results)

# Get best model by metric (e.g., 'rmse')
best_model_name, best_model = manager.get_best_model(metric='rmse', higher_is_better=False)
print(f"\nBest model by RMSE: {best_model_name}")
time_end = time.time()

# Evaluate best model on the test set separately


  from .autonotebook import tqdm as notebook_tqdm


Evaluation Results (CV + Test):
                   cv_rmse_mean  cv_rmse_std  cv_mae_mean  cv_mae_std  \
model                                                                   
linear_regression      0.804085     0.130024     0.547236    0.003277   
elastic_net            0.879462     0.008786     0.682731    0.005334   
knn_regression         0.633650     0.010226     0.437635    0.004728   
bayesian_ridge         0.804091     0.130040     0.547226    0.003273   
decision_tree          0.714191     0.016612     0.452715    0.008101   
lasso                  0.977362     0.010784     0.770404    0.007469   
ridge                  0.804086     0.130027     0.547234    0.003276   
adaboost               0.830984     0.078340     0.696649    0.084215   
gradient_boosting      0.535417     0.013299     0.370083    0.005052   
random_forest          0.514039     0.010519     0.332940    0.003713   
svr                    0.767928     0.015141     0.540767    0.008894   

                  

In [2]:
print(f"Time taken for evaluation: {(time_end - time_start)//60} minutes")

Time taken for evaluation: 1.0 minutes


In [3]:
manager.get_hyperparameters()

{'linear_regression': {},
 'random_forest': {},
 'decision_tree': {},
 'svr': {},
 'knn_regression': {},
 'gradient_boosting': {},
 'elastic_net': {},
 'bayesian_ridge': {},
 'adaboost': {},
 'lasso': {},
 'ridge': {}}

In [5]:
import matplotlib
print(matplotlib.__version__)


3.10.3


In [6]:
manager.get_plots()['linear_regression']['scatter']

KeyError: 'scatter'

In [12]:
manager.eval_results

{'cv_scores': {'rmse': (np.float64(0.8040853634139525),
   np.float64(0.1300242365136351)),
  'mae': (np.float64(0.5472359711631587), np.float64(0.003277065357366334)),
  'r2': (np.float64(0.5003432933141967), np.float64(0.1815338559505305))},
 'test_scores': {'rmse': np.float64(0.717256211776781),
  'mae': 0.5291309251606496,
  'r2': 0.6206697989665664},
 'plots': {'line': <Figure size 640x480 with 1 Axes>,
  'scatter': <Figure size 640x480 with 1 Axes>}}

In [3]:
from plantbrain_fastml.managers.regressor_manager import RegressorManager
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import pandas as pd
import time
import matplotlib.pyplot as plt

# It's good practice to wrap the entire script logic in a main function
def main():
    # --- 1. Load and Prepare the Dataset ---
    print("Loading the Diabetes dataset...")
    data = load_diabetes()
    X = pd.DataFrame(data.data, columns=data.feature_names)
    y = pd.Series(data.target)
    print("Dataset loaded successfully.")
    print(f"Features shape: {X.shape}")
    print(f"Target shape: {y.shape}")

    print("\nSplitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")


    # --- 2. Initialize the Regressor Manager ---
    print("\nInitializing the RegressorManager...")
    manager = RegressorManager()
    print("RegressorManager initialized.")
    print("Available models:", list(manager.models.keys()))


    # --- 3. Evaluate All Models Using Default Metrics ---
    print("\nStarting model evaluation...")
    start_time = time.time()

    results = manager.evaluate_all(
        X_train,
        y_train,
        hypertune=True,
        hypertune_params={'n_trials': 2},
        hypertune_metrics='r2',
        n_jobs=-2,
        cv_folds=3,
        test_size=0.2,
        feature_elimination=True,
        fe_n_features=5,
        fe_method='lasso',
        return_plots=True
    )

    end_time = time.time()
    print(f"\nEvaluation completed in {end_time - start_time:.2f} seconds.")


    # --- 4. Display Results and Get Best Model ---
    print("\n--- Evaluation Results (Cross-Validation & Test Scores) ---")
    print(results)

    # Check if results are not empty before proceeding
    if not results.empty:
        print("\n--- Getting the Best Model ---")
        best_model_name, best_model_object = manager.get_best_model(metric='rmse', higher_is_better=False)
        print(f"Best performing model based on RMSE: '{best_model_name}'")

        hyperparams = manager.get_hyperparameters()
        print(f"\nTuned Hyperparameters for {best_model_name}:")
        print(hyperparams.get(best_model_name))


        # --- 5. Make Predictions with the Best Model ---
        print("\n--- Making Predictions with the Best Model ---")

        # The preprocessor was already fitted during the evaluate_all call.
        # First, process the TRAINING data.
        print("Applying feature elimination to the training set...")
        X_train_processed = best_model_object.preprocessor.transform(X_train)

        # **** THIS IS THE NEW, CRUCIAL STEP ****
        # Now, train the best model on the processed training data.
        print(f"Training the final '{best_model_name}' model on the full training set...")
        best_model_object.train(X_train_processed, y_train) # Use the .train() method of your wrapper

        # Now, process the TEST data using the same preprocessor
        print("Applying the same feature elimination to the test set...")
        X_test_processed = best_model_object.preprocessor.transform(X_test)

        # Finally, make predictions on the processed test data
        predictions = best_model_object.predict(X_test_processed)

        print("\nPredictions made on the processed test set.")
        sample_comparison = pd.DataFrame({'Actual': y_test.values, 'Predicted': predictions}).head(10)
        print(sample_comparison)
        plots=manager.get_plots()

        if plots and best_model_name in plots:
            scatter_plot = plots[best_model_name].get('scatter')
            if scatter_plot:
                print("Displaying Predicted vs. Actual scatter plot...")
                # In a script you would use scatter_plot.show(), in a notebook this will display it
                display(scatter_plot)
        else:
            print(f"No plots found for model '{best_model_name}'.")
    else:
        print("\nEvaluation produced no results. Cannot determine the best model.")

    print("\nNotebook execution finished.")

# This is the crucial part!
# This tells Python to only run the main() function when the script is executed directly.
if __name__ == '__main__':
    main()

Loading the Diabetes dataset...
Dataset loaded successfully.
Features shape: (442, 10)
Target shape: (442,)

Splitting data into training and testing sets...
Training set size: 353
Test set size: 89

Initializing the RegressorManager...
RegressorManager initialized.
Available models: ['linear_regression', 'random_forest', 'decision_tree', 'svr', 'knn_regression', 'gradient_boosting', 'elastic_net', 'bayesian_ridge', 'adaboost', 'lasso', 'ridge']

Starting model evaluation...

Evaluation completed in 21.32 seconds.

--- Evaluation Results (Cross-Validation & Test Scores) ---
                   cv_rmse_mean  cv_rmse_std  cv_mae_mean  cv_mae_std  \
model                                                                   
linear_regression    159.822265     6.886711   148.749428    7.166158   
knn_regression        60.000761     2.680674    48.495329    2.520772   
svr                   61.197223     1.960027    48.288840    1.142992   
elastic_net           56.585578     0.628126    45.462

In [1]:
import multiprocessing
from joblib import Parallel, delayed


total_cores = multiprocessing.cpu_count()

In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from plantbrain_fastml.managers.classifier_manager import ClassifierManager

# --- 1. Load and Prepare Data ---
print("Loading the Breast Cancer dataset...")
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name='target')
print("Dataset loaded successfully.")

# Split data, ensuring stratified split for classification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


# --- 2. Initialize the Classifier Manager ---
manager = ClassifierManager()
print("\nRegressorManager initialized.")
print("Available models:", list(manager.models.keys()))


# --- 3. Define Metrics and Evaluate All Models ---

# Define the metrics to calculate as a DICTIONARY
# Note: roc_auc requires probability scores, which our BaseClassifier handles.
classification_metrics_to_calculate = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
    'roc_auc': roc_auc_score
}

print("\nStarting model evaluation...")
start_time = time.time()

# Evaluate all models, using 'roc_auc' as the goal for hyperparameter tuning
results = manager.evaluate_all(
    X_train,
    y_train,
    metrics=classification_metrics_to_calculate,
    hypertune=True,
    hypertune_params={'n_trials': 10}, # n_trials can be increased for a more thorough search
    hypertune_metrics='roc_auc',
    n_jobs=3, # Use all available CPU cores
)

end_time = time.time()
print(f"\nEvaluation completed in {end_time - start_time:.2f} seconds.")


# --- 4. Display Results and Get Best Model ---
print("\n--- Evaluation Results ---")
print(results)

print("\n--- Getting the Best Model ---")
# Get the best model based on the test set's roc_auc score
best_model_name, best_model_object = manager.get_best_model(metric='roc_auc', higher_is_better=True)
print(f"Best performing model based on Test ROC AUC: '{best_model_name}'")


# --- 5. In-depth Analysis of the Best Model ---
print(f"\n--- Analysis of Best Model: {best_model_name} ---")

# Get the detailed classification report from the fitted model
# Note: This requires a custom method on the BaseClassifier, which you've added.
report = best_model_object.get_classification_report()
if report:
    print("Classification Report on Test Set:")
    print(pd.DataFrame(report).transpose())

# Get tuned hyperparameters from the manager
hyperparams = manager.get_hyperparameters()
print(f"\nTuned Hyperparameters for {best_model_name}:")
print(hyperparams.get(best_model_name))


# --- 6. Make Predictions on New Data ---
print("\n--- Making Predictions on the Test Set ---")
# Use the best model's own preprocessor to transform the test data
X_test_processed = best_model_object.preprocessor.transform(X_test)

# Get final class predictions
print("Applying feature elimination to the training set...")
X_train_processed = best_model_object.preprocessor.transform(X_train)

# **** THIS IS THE NEW, CRUCIAL STEP ****
# Now, train the best model on the processed training data.
print(f"Training the final '{best_model_name}' model on the full training set...")
best_model_object.train(X_train_processed, y_train) # Use the .train() method of your wrapper

# Now, process the TEST data using the same preprocessor
print("Applying the same feature elimination to the test set...")
X_test_processed = best_model_object.preprocessor.transform(X_test)

# Finally, make predictions on the processed test data
predictions = best_model_object.predict(X_test_processed)

# Get prediction probabilities
probabilities = best_model_object.predict_proba(X_test_processed)

print("Sample Predictions:")
print(predictions[:5])
print("\nSample Probabilities (for class 0 and 1):")
print(probabilities[:5])

print("\nNotebook execution finished.")

  from .autonotebook import tqdm as notebook_tqdm


Loading the Breast Cancer dataset...
Dataset loaded successfully.
Training set size: 455
Test set size: 114

RegressorManager initialized.
Available models: ['random_forest', 'logistic_regression', 'svc']

Starting model evaluation...

Evaluation completed in 29.70 seconds.

--- Evaluation Results ---
                     cv_accuracy_mean  cv_accuracy_std  cv_precision_mean  \
model                                                                       
svc                          0.945091         0.012126           0.940319   
logistic_regression          0.967047         0.013941           0.973629   
random_forest                0.958828         0.012168           0.956298   

                     cv_precision_std  cv_recall_mean  cv_recall_std  \
model                                                                  
svc                          0.020827        0.974534       0.020913   
logistic_regression          0.020723        0.974172       0.016359   
random_forest          