In [1]:

from plantbrain_fastml.managers.regressor_manager import RegressorManager
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import pandas as pd
import time
# Load California housing dataset as DataFrame for compatibility
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Initialize manager and add models (if not already added)
manager = RegressorManager()
# (If RegressorManager already adds models in __init__, no need to add explicitly)
# Otherwise, you can add explicitly:
# manager.add_model("linear_regression", LinearRegressionRegressor())
# manager.add_model("random_forest", RandomForestRegressorWrapper())

# Evaluate all models on training data with hypertuning and feature elimination
time_start = time.time()
results = manager.evaluate_all(
    X_train, y_train,
    hypertune=False,
    hypertune_params={'n_trials': 2},
    n_jobs=6,  # Set to -1 for all cores
    cv_folds=5,
    test_size=0.1,
    feature_elimination=True,
    fe_n_features=5,
    fe_method='lasso',
    return_plots=True  # set True if you want plots
)

print("Evaluation Results (CV + Test):")
print(results)

# Get best model by metric (e.g., 'rmse')
best_model_name, best_model = manager.get_best_model(metric='rmse', higher_is_better=False)
print(f"\nBest model by RMSE: {best_model_name}")
time_end = time.time()

# Evaluate best model on the test set separately


  from .autonotebook import tqdm as notebook_tqdm


Evaluation Results (CV + Test):
                   cv_rmse_mean  cv_rmse_std  cv_mae_mean  cv_mae_std  \
model                                                                   
linear_regression      0.804085     0.130024     0.547236    0.003277   
elastic_net            0.879462     0.008786     0.682731    0.005334   
knn_regression         0.633650     0.010226     0.437635    0.004728   
bayesian_ridge         0.804091     0.130040     0.547226    0.003273   
decision_tree          0.714191     0.016612     0.452715    0.008101   
lasso                  0.977362     0.010784     0.770404    0.007469   
ridge                  0.804086     0.130027     0.547234    0.003276   
adaboost               0.830984     0.078340     0.696649    0.084215   
gradient_boosting      0.535417     0.013299     0.370083    0.005052   
random_forest          0.514039     0.010519     0.332940    0.003713   
svr                    0.767928     0.015141     0.540767    0.008894   

                  

In [2]:
print(f"Time taken for evaluation: {(time_end - time_start)//60} minutes")

Time taken for evaluation: 1.0 minutes


In [3]:
manager.get_hyperparameters()

{'linear_regression': {},
 'random_forest': {},
 'decision_tree': {},
 'svr': {},
 'knn_regression': {},
 'gradient_boosting': {},
 'elastic_net': {},
 'bayesian_ridge': {},
 'adaboost': {},
 'lasso': {},
 'ridge': {}}

In [5]:
import matplotlib
print(matplotlib.__version__)


3.10.3


In [6]:
manager.get_plots()['linear_regression']['scatter']

KeyError: 'scatter'

In [12]:
manager.eval_results

{'cv_scores': {'rmse': (np.float64(0.8040853634139525),
   np.float64(0.1300242365136351)),
  'mae': (np.float64(0.5472359711631587), np.float64(0.003277065357366334)),
  'r2': (np.float64(0.5003432933141967), np.float64(0.1815338559505305))},
 'test_scores': {'rmse': np.float64(0.717256211776781),
  'mae': 0.5291309251606496,
  'r2': 0.6206697989665664},
 'plots': {'line': <Figure size 640x480 with 1 Axes>,
  'scatter': <Figure size 640x480 with 1 Axes>}}

In [None]:
from plantbrain_fastml.managers.regressor_manager import RegressorManager
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import pandas as pd
import time
import matplotlib.pyplot as plt

# --- 1. Load and Prepare the Dataset ---
print("Loading the Diabetes dataset...")
data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
print("Dataset loaded successfully.")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

print("\nSplitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


# --- 2. Initialize the Regressor Manager ---
print("\nInitializing the RegressorManager...")
manager = RegressorManager()
print("RegressorManager initialized.")
print("Available models:", list(manager.models.keys()))


# --- 3. Evaluate All Models Using Default Metrics ---
print("\nStarting model evaluation...")
start_time = time.time()

# Evaluate all models WITHOUT passing a custom metrics list.
# This will use the default metrics, which include 'rmse'.
results = manager.evaluate_all(
    X_train,
    y_train,
    hypertune=False,
    hypertune_params={'n_trials': 5},
    n_jobs=1,
    cv_folds=3,
    test_size=0.2,
    feature_elimination=True,
    fe_n_features=5,
    fe_method='lasso',
    return_plots=True
)

end_time = time.time()
print(f"\nEvaluation completed in {end_time - start_time:.2f} seconds.")


# --- 4. Display Results and Get Best Model ---
print("\n--- Evaluation Results (Cross-Validation & Test Scores) ---")
print(results)

print("\n--- Getting the Best Model ---")
# Get the best model by 'rmse'. Lower is better.
best_model_name, best_model_object = manager.get_best_model(metric='rmse', higher_is_better=False)
print(f"Best performing model based on RMSE: '{best_model_name}'")


# --- 5. Make Predictions with the Best Model ---
print("\n--- Making Predictions with the Best Model ---")
# best_model_object was returned from the get_best_model call earlier

# Use the best model's own fitted preprocessor to transform the test data
print("Applying the same feature elimination to the test set...")
X_test_processed = best_model_object.preprocessor.transform(X_test)

# Now make predictions on the data that has the correct 5 features
predictions = best_model_object.predict(X_test_processed)

print("\nPredictions made on the processed test set.")
# Display sample predictions vs. actual values
sample_comparison = pd.DataFrame({'Actual': y_test.values, 'Predicted': predictions}).head(10)
print(sample_comparison)

# --- 6. Display Plots for the Best Model ---
print(f"\n--- Displaying Plots for '{best_model_name}' ---")
plots = manager.get_plots()

if plots and best_model_name in plots:
    # Show the scatter plot (Predicted vs. Actual)
    scatter_plot = plots[best_model_name].get('scatter')
    if scatter_plot:
        print("Displaying Predicted vs. Actual scatter plot...")
        scatter_plot.show()

    # Show the feature importance plot (if available)
    feature_importance_plot = plots[best_model_name].get('feature_importance')
    if feature_importance_plot:
        print("\nDisplaying Feature Importance plot...")
        feature_importance_plot.show()
else:
    print(f"No plots found for model '{best_model_name}'.")

print("\nNotebook execution finished.")

  from .autonotebook import tqdm as notebook_tqdm


Loading the Diabetes dataset...
Dataset loaded successfully.
Features shape: (442, 10)
Target shape: (442,)

Splitting data into training and testing sets...
Training set size: 353
Test set size: 89

Initializing the RegressorManager...
RegressorManager initialized.
Available models: ['linear_regression', 'random_forest', 'decision_tree', 'svr', 'knn_regression', 'gradient_boosting', 'elastic_net', 'bayesian_ridge', 'adaboost', 'lasso', 'ridge']

Starting model evaluation...

Evaluation completed in 2.56 seconds.

--- Evaluation Results (Cross-Validation & Test Scores) ---
                   cv_rmse_mean  cv_rmse_std  cv_mae_mean  cv_mae_std  \
model                                                                   
linear_regression     56.607819     0.976782    45.406853    0.920921   
random_forest         60.790774     2.901088    47.973298    2.717225   
decision_tree         79.080974     3.550187    60.581560    2.079061   
svr                   74.856356     1.409415    61.4935

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- age
- s2
- s3
- s4
- s6


In [1]:
import multiprocessing
from joblib import Parallel, delayed


total_cores = multiprocessing.cpu_count()

In [2]:
total_cores

12

In [8]:
_get_effective_n_jobs(-1)

12