In [1]:

import numbers
import warnings
import sklearn
from sklearn.base import BaseEstimator
from sklearn.utils.multiclass import type_of_target
import numpy as np
import scipy.sparse as sp
from joblib import Parallel, delayed
from sklearn.base import clone, is_classifier
from sklearn.model_selection import KFold, StratifiedKFold, check_cv, GridSearchCV, BaseCrossValidator, RandomizedSearchCV
# TODO: conisder working around relying on sklearn implementation details
from sklearn.model_selection._validation import (_check_is_permutation,
                                                 _fit_and_predict)
from sklearn.exceptions import FitFailedWarning
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import indexable, check_random_state
from sklearn.utils.validation import _num_samples
from model_selection_utils import *
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection._search import BaseSearchCV
from sklearn.model_selection import train_test_split

# Load data and create a model
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
class SearchEstimatorList(BaseEstimator):
    def __init__(self, estimator_list = ['linear', 'forest'], param_grid_list = 'auto', is_discrete=False, scoring=None,
                 n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
                 error_score=np.nan, return_train_score=False):

        self.estimator_list = get_complete_estimator_list(estimator_list, 'discrete' if is_discrete else 'continuous')

        if param_grid_list == 'auto':
            self.param_grid_list = auto_hyperparameters(estimator_list=self.estimator_list, is_discrete=is_discrete)
        elif (param_grid_list == None) and (param_grid_list == 'default'):
            self.param_grid_list = len(estimator_list) * [{}]
        else:
            self.param_grid_list = param_grid_list
        # self.categorical_indices = categorical_indices
        if scoring == None:
            if is_discrete:
                self.scoring = 'mse'
            else:
                self.scoring = 'mse'
            warnings.warn(f"No scoring value was given. Using default score method {self.scoring}.")
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.cv = cv
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch
        self.error_score = error_score
        self.return_train_score = return_train_score
        return

    def select(self, X, y, *, scaling=True, sample_weight=None, groups=None):
        """
        Perform cross-validation on the estimator list.
        """
        self._search_list = []
        self.scaling = scaling
        if scaling:
            if is_data_scaled(X):
                warnings.warn("Data may already be scaled. Scaling twice may negatively affect results.", UserWarning)
            self.scaler = StandardScaler()
            self.scaler.fit(X)
            scaled_X = self.scaler.transform(X)

        for estimator, param_grid in zip(self.estimator_list, self.param_grid_list):
            try:
                temp_search = GridSearchCV(estimator, param_grid, scoring=self.scoring,
                                       n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose,
                                       pre_dispatch=self.pre_dispatch, error_score=self.error_score,
                                       return_train_score=self.return_train_score)
                if scaling: # is_linear_model(estimator) and
                    temp_search.fit(scaled_X, y, groups=groups) # , groups=groups, sample_weight=sample_weight
                    self._search_list.append(temp_search)
                else:
                    temp_search.fit(X, y,  groups=groups)
                    self._search_list.append(temp_search)
            except (ValueError, TypeError, FitFailedWarning) as e:
                # Raise a warning for the failed initialization
                warning_msg = f"Warning: {e} for estimator {estimator} and param_grid {param_grid}"
                warnings.warn(warning_msg, category=UserWarning)
        self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list])
        self.best_estimator_ = self._search_list[self.best_ind_].best_estimator_
        self.best_score_ = self._search_list[self.best_ind_].best_score_
        self.best_params_ = self._search_list[self.best_ind_].best_params_
        return self
    
    def scaler_transform(self, X):
        if self.scaling:    
            return self.scaler.transform(X)
        
    def best_model(self):
        return self.best_estimator_
    
    def predict(self, X):
        if self.scaling:    
            if is_data_scaled(X):
                warnings.warn("Data may already be scaled. Scaling twice may negatively affect results.", UserWarning)
            return self.best_estimator_.predict(self.scaler.transform(X))
        return self.best_estimator_.predict(X)
    
    def predict_prob(self, X):
        if self.scaling:    
            if is_data_scaled(X):
                warnings.warn("Data may already be scaled. Scaling twice may negatively affect results.", UserWarning)
            return self.best_estimator_.predict(self.scaler.transform(X))
        return self.best_estimator_.predict_proba(X)

## Testing string inputs

In [3]:
search = SearchEstimatorList(estimator_list = 'linear', is_discrete=False)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("MSE of test dataset:", mse)
print(search.predict_prob(X_test).shape)





ElasticNetCV(l1_ratio=0.9)
{'l1_ratio': 0.9, 'max_iter': 1000}
MSE of test dataset: 0.5547153680110064
(4128,)


In [4]:
search = SearchEstimatorList(estimator_list = 'poly', is_discrete=False)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("mse of test dataset:", mse,)
print(search.predict_prob(X_test).shape)



  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Pipeline(steps=[('poly', PolynomialFeatures()),
                ('linear', ElasticNetCV(cv=3, l1_ratio=0.1))])
{'linear__l1_ratio': 0.1, 'linear__max_iter': 1000, 'poly__degree': 2}
mse of test dataset: 0.5908005047742894
(4128,)


  model = cd_fast.enet_coordinate_descent(


In [4]:
search = SearchEstimatorList(estimator_list = 'gbf', is_discrete=False)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print("mse of test dataset:", mse,)
print(search.predict_prob(X_test).shape)





GradientBoostingRegressor(max_depth=5)
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
mse of test dataset: 0.24642752432508827
(4128,)


In [5]:
search = SearchEstimatorList(estimator_list = 'nnet', is_discrete=False)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
 
print("mse of test dataset:", mse)
print(search.predict_prob(X_test).shape)





MLPRegressor()
{'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
mse of test dataset: 0.30709539045335965
(4128,)


In [2]:
search = SearchEstimatorList(estimator_list = ['linear', 'forest'], is_discrete=False)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
 
print("mse of test dataset:", mse,)
print(search.predict_prob(X_test).shape)



NameError: name 'SearchEstimatorList' is not defined

In [4]:
search = SearchEstimatorList(estimator_list = ['linear', 'forest', 'gbf', 'nnet', 'poly'], is_discrete=False)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
 
print("mse of test dataset:", mse,)
print(search.predict_prob(X_test).shape)





## Testing Model Objects

In [None]:
search = SearchEstimatorList(estimator_list = LogisticRegression(), is_discrete=False)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print("mse of test dataset:", mse,)
print(search.predict_prob(X_test).shape)



LogisticRegression()
{}
Accuracy of test dataset: 1.0
Accuracy of test dataset: [1. 1. 1.]
(30, 3)




In [None]:
search = SearchEstimatorList(estimator_list = LogisticRegressionCV(), is_discrete=False)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print("mse of test dataset:", mse,)
print(search.predict_prob(X_test).shape)





LogisticRegressionCV(Cs=1, solver='liblinear')
{'Cs': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy of test dataset: 0.8
Accuracy of test dataset: [1.         0.5        0.78571429]
(30, 3)


110 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1672, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none

In [None]:
search = SearchEstimatorList(estimator_list = LogisticRegressionCV(), is_discrete=False)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print("mse of test dataset:", mse,)
print(search.predict_prob(X_test).shape)



{'linear__Cs': 10,
 'linear__penalty': 'l2',
 'linear__solver': 'saga',
 'poly__degree': 2}

## Edge Cases

In [None]:
search = SearchEstimatorList(estimator_list = [], is_discrete=False)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)

print("mse:", acc)
print(search.predict_prob(X_test))


ValueError: The list is empty

In [None]:
print(search.best_model())

RandomForestClassifier(n_estimators=1000)


True
True


In [None]:
RandomizedSearchCV == RandomizedSearchCV

True

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# def set_search_hyperparameters(search_object, hyperparameters):
#     if isinstance(search_object, (RandomizedSearchCV, GridSearchCV)):
#         print('hi')
#         search_object.set_params(**hyperparameters)
#     else:
#         raise ValueError("Invalid search object")

# Example usage
search = RandomizedSearchCV(None, None)

if isinstance(search, (RandomizedSearchCV, GridSearchCV)):
    print('hi')
    

hi


In [None]:
GridSearchCV(None, {})

In [None]:
import numpy as np
def is_data_scaled(X):
    """
    Check if the input data is already centered and scaled using StandardScaler.

    Args:
        X array-like of shape (n_samples, n_features): The input data.

    Returns:
        is_scaled (bool): Whether the input data is already centered and scaled using StandardScaler or not.

    """
    # Compute the mean and standard deviation of the scaled data
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    print(mean)
    print(std)
    # Check if the mean is close to 0 and the standard deviation is close to 1
    is_scaled = np.allclose(mean, 0.0) and np.allclose(std, 1.0)

    return is_scaled


In [None]:
from sklearn.preprocessing import (MaxAbsScaler, MinMaxScaler,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)

In [None]:
X = np.array([[0.0, -1.0], [1.0, 0.0], [-1.0, 1.0]])
scale = StandardScaler()
scaled_X = scale.fit_transform(X)
is_data_scaled(scaled_X)

[0. 0.]
[1. 1.]


True

In [None]:
np.std(scaled_X)

0.9999999999999999

In [None]:
np.std(scaled_X, axis=0)

array([1., 1.])

In [None]:
grid_search.fit(X, y)

TypeError: Parameter grid should be a dict or a list, got: None of type NoneType

In [None]:
model.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:

import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.neural_network
import sklearn.preprocessing
from sklearn.base import BaseEstimator
from sklearn.ensemble import (GradientBoostingClassifier,
                              GradientBoostingRegressor,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV
from sklearn.model_selection import BaseCrossValidator
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (MaxAbsScaler, MinMaxScaler,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)

from sklearn.base import is_regressor
from sklearn.linear_model import (ARDRegression, BayesianRidge, ElasticNet,
                                  Lars, Lasso, LassoLars, LinearRegression,
                                  OrthogonalMatchingPursuit, Ridge)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, \
    OrthogonalMatchingPursuit, Lars, LassoLars, BayesianRidge, ARDRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC

def is_linear_model(estimator):
    """
    Check whether an estimator is a polynomial regression, logistic regression, linear SVM, or any other type of
    linear model.

    Parameters:
    estimator (scikit-learn estimator): The estimator to check.

    Returns:
    is_linear (bool): True if the estimator is a linear model, False otherwise.
    """

    # Check if the estimator is a polynomial regression
    if isinstance(estimator, Pipeline):
        has_poly_feature_step = any(isinstance(step[1], PolynomialFeatures) for step in estimator.steps)
        if has_poly_feature_step:
            return True

    # Check if the estimator is a linear regression or related model
    if hasattr(estimator, 'fit_intercept') and hasattr(estimator, 'coef_'):
        return True

    # Check if the estimator is a logistic regression or linear SVM
    if isinstance(estimator, (LogisticRegression, LinearSVC, SVC)):
        return True

    # Otherwise, the estimator is not a linear model
    return False


In [None]:
hasattr(LinearRegression(), 'fit_intercept')

True

In [None]:
is_linear_model(LinearRegression())

False

In [None]:

import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.neural_network
import sklearn.preprocessing
from sklearn.base import BaseEstimator
from sklearn.ensemble import (GradientBoostingClassifier,
                              GradientBoostingRegressor,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV
from sklearn.model_selection import BaseCrossValidator
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (MaxAbsScaler, MinMaxScaler,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)

def is_linear_model(estimator):
    """
    Check whether an estimator is a polynomial regression or any other type of linear model.

    Parameters:
    estimator (scikit-learn estimator): The estimator to check.

    Returns:
    is_linear (bool): True if the estimator is a linear model, False otherwise.
    """

    # Check if the estimator is a polynomial regression
    if isinstance(estimator, Pipeline):
        has_poly_feature_step = any(isinstance(step[1], PolynomialFeatures) for step in estimator.steps)
        if has_poly_feature_step:
            return True

    # Check if the estimator is any other type of linear model
    if is_regressor(estimator) and isinstance(estimator, (LinearRegression, Ridge, Lasso, ElasticNet, 
                                                          OrthogonalMatchingPursuit, Lars, LassoLars, 
                                                          BayesianRidge, ARDRegression)):
        return True

    # Otherwise, the estimator is not a linear model
    return False




In [None]:
from sklearn.base import is_regressor

is_linear_model(LinearRegression())

NameError: name 'LinearRegression' is not defined

In [None]:
type(BaseEstimator()) == elastic

False

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator
from sklearn.model_selection import BaseCrossValidator

lr = [LinearRegression()]
if not isinstance(lr[0], (str, BaseEstimator, BaseCrossValidator)):
    raise TypeError("The list must contain only strings, sklearn model objects, and sklearn model selection objects.")
print("Yay")

Yay


In [None]:
import sklearn.linear_model
import sklearn.ensemble
import sklearn.neural_network
import sklearn.preprocessing
import numpy as np

def select_estimator(estimator_type, target_type):
    if target_type not in ['continuous', 'discrete']:
        raise ValueError(f"Unsupported target type: {target_type}")
    if estimator_type == 'linear':
        if target_type == 'continuous':
            return sklearn.linear_model.ElasticNetCV()
        elif target_type == 'discrete':
            return sklearn.linear_model.LogisticRegressionCV()
    elif estimator_type == 'forest':
        if target_type == 'continuous':
            return sklearn.ensemble.RandomForestRegressor()
        elif target_type == 'discrete':
            return sklearn.ensemble.RandomForestClassifier()
    elif estimator_type == 'gbf':
        if target_type == 'continuous':
            return sklearn.ensemble.GradientBoostingRegressor()
        elif target_type == 'discrete':
            return sklearn.ensemble.GradientBoostingClassifier()
    elif estimator_type == 'nnet':
        if target_type == 'continuous':
            return sklearn.neural_network.MLPRegressor()
        elif target_type == 'discrete':
            return sklearn.neural_network.MLPClassifier()
    elif estimator_type == 'poly':
        degrees = [2, 3, 4]  
        models = []
        if target_type == 'continuous':
            return sklearn.linear_model.ElasticNetCV(precompute=True)
        elif target_type == 'discrete':
            return sklearn.linear_model.LogisticRegressionCV()
    elif estimator_type == 'automl':
        return    
    elif estimator_type == 'all':
        if target_type == 'continuous':
            return sklearn.ensemble.VotingRegressor(estimators=[
                ('linear', select_estimator('linear', target_type)),
                ('forest', select_estimator('forest', target_type)),
                ('gbf', select_estimator('gbf', target_type)),
                ('nnet', select_estimator('nnet', target_type)),
                ('poly', select_estimator('poly', target_type)),
            ], voting='soft')
        elif target_type == 'discrete':
            return sklearn.ensemble.VotingClassifier(estimators=[
                ('linear', select_estimator('linear', target_type)),
                ('forest', select_estimator('forest', target_type)),
                ('gbf', select_estimator('gbf', target_type)),
                ('nnet', select_estimator('nnet', target_type)),
                ('poly', select_estimator('poly', target_type)),
            ], voting='soft')


SyntaxError: invalid syntax (1604001357.py, line 1)

In [None]:
# abc = ['a', 'b', 'c']
abc = 'abc'
if not isinstance(abc, list):
    print('hi')

hi


In [None]:
import sklearn.linear_model
import sklearn.ensemble
import sklearn.neural_network
import sklearn.preprocessing
import numpy as np
import sklearn.pipeline
from sklearn.linear_model import ElasticNetCV
from sklearn.datasets import make_regression, make_classification
from sklearn.model_selection import train_test_split
import numpy as np

# param_grid = {

target_type = 'continuous'
degrees = [2, 3, 4]  
models = []
for degree in degrees:
    poly = sklearn.preprocessing.PolynomialFeatures(degree=degree)
    if target_type == 'continuous':
        linear = sklearn.linear_model.ElasticNetCV(precompute=True, cv=3, tol=0.1, verbose=1)
    elif target_type == 'discrete':
        linear = sklearn.linear_model.LogisticRegressionCV()
    else:
        raise ValueError(f"Unsupported target type: {target_type}")
    models.append((f"poly{degree}", sklearn.pipeline.Pipeline([('poly', poly), ('linear', linear)])))
model = sklearn.ensemble.VotingRegressor(estimators=models)

# generate some regression data
X, y = make_regression(n_samples=1000, n_features=20, random_state=42)

# split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8)

model.fit(X_train, y_train)

# predict on the test set
y_pred = model.predict(X_test)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
............................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
............................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
.................

In [None]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
print(mse)

1566.204247883811
