In [1]:
# -*- coding: utf-8 -*-
"""
Machine Learning part 

@Author : Pierre CLaver 
Date:2024.12.11 

"""


#
# Install the package watex to get the advantages of the ML utilities
# ! pip install watex

'\nMachine Learning part \n\n@Author : Pierre CLaver \nDate:2023.12.11 \n\n'

In [2]:
#import pandas as pd
#import joblib
#pd.__version__

#joblib.__version__
#import xgboost as xgb
#xgb.__version__

In [3]:
#import numpy as np

In [4]:
#np.__version__

In [5]:
# Import required modules
# XXX IMPORTANT ! (1)
import warnings
import contextlib
import copy
import re
import numpy as np
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
import joblib

import watex as wx
from watex.utils import naive_scaler  # naive_imputer,
from watex.utils import savejob  # make_naive_pipe,
from watex.utils.funcutils import smart_format, is_iterable, is_in_if
# from watex.utils import bin_counting, bi_selector
# from watex.exlib import (
#     train_test_split,

# )
from watex.exlib import (
    KNeighborsClassifier,
    DecisionTreeClassifier,
    LogisticRegression,
    SVC,
    RandomForestClassifier,
    AdaBoostClassifier,
    # StandardScaler,
    # Normalizer,
    # RobustScaler,
    # MinMaxScaler,
    GridSearchCV,
    RandomizedSearchCV,
    accuracy_score,
    precision_score,
    recall_score,

)
from watex.models.validation import getGlobalScores
from watex.utils.box import Boxspace
from watex.exlib.gbm import XGBClassifier
from watex.utils.validator import get_estimator_name
# from sklearn.discriminant_analysis import LDA
# from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

# from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score  # roc_auc_score,
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# # GRID PARAMS
# from scipy.stats import expon  # uniform, randint
from skopt.space import Categorical, Integer, Real
from skopt.searchcv import BayesSearchCV
# import time

# from sklearn.
from lightgbm import LGBMClassifier
# import CatBoost
# from pprint import pprint
#
# * Preprocessing
# Load the raw dataset from 'alsani.csv' (a path relative to the current
# working directory) with the watex reader. The optional `sanitize` flag is
# left disabled.
data = wx.read_data('alsani.csv',
                    #  sanitize=True
                    )

#

In [6]:
# Functions utilities

# XXX IMPORTANT ! (2)


def scale_data(d, scaler, save=False, filename=None, **scaler_kws):
    """Scale data using a scikit-learn style scaling estimator.

    The data is first handed to ``naive_scaler`` (from ``watex.utils``). If
    that call fails, `scaler` is instantiated with `scaler_kws` and its
    ``fit_transform`` method is applied to `d` instead.

    Parameters
    ------------
    d: ArrayLike or pandas DataFrame
       Array or DataFrame containing the valid numeric data.

    scaler: :class:`~sklearn.preprocessing.*`
       The scaling estimator. It could be `StandardScaler`, `RobustScaler`,
       `Normalizer` or `MinMaxScaler` etc.

    save: bool, default=False
       If ``True``, also write the scaled data to ``<filename>.csv``.

    filename: str, optional
       Name of the output CSV file; a trailing ``.csv`` is optional. When
       omitted, the name returned by ``get_estimator_name(scaler)`` is used.

    scaler_kws: dict
       Keyword arguments forwarded to ``naive_scaler`` or, in the fallback
       path, to the `scaler` constructor.

    Returns
    ---------
    X_transf: ArrayLike or pandas DataFrame
       The scaled data. In the fallback path a DataFrame is built when the
       fitted scaler exposes ``feature_names_in_``.

    """
    try:
        X_transf = naive_scaler(d, kind=scaler, **scaler_kws)
    except Exception:
        # Best-effort fallback: drive the estimator directly. Catching
        # ``Exception`` (not a bare ``except``) keeps KeyboardInterrupt and
        # SystemExit from being swallowed.
        sc = scaler(**scaler_kws)
        X_transf = sc.fit_transform(d)
        if hasattr(sc, 'feature_names_in_'):
            X_transf = pd.DataFrame(X_transf, columns=sc.feature_names_in_)

    if save:
        filename = filename or get_estimator_name(scaler)
        # Drop only a trailing '.csv' so it is not appended twice.
        if filename.endswith('.csv'):
            filename = filename[:-len('.csv')]
        # The scaler may hand back a plain array/list, which has no `to_csv`.
        frame = X_transf if hasattr(
            X_transf, 'to_csv') else pd.DataFrame(X_transf)
        frame.to_csv(f'{filename}.csv', index=False)

    return X_transf


class EstimatorError(Exception):
    """Raised when an estimator cannot be matched to a known acronym.

    Derives from :class:`Exception` (not :class:`BaseException`) so that
    ordinary ``except Exception`` handlers can catch it.
    """


def make_Xy_scale_data(*X, y, scalers: list, save=False, filename='DATA'):
    """Build one scaled ``(X, y)`` pair per scaler.

    For every scaler the numerical features are scaled with
    :func:`scale_data` and concatenated column-wise with the categorical
    features. Each resulting frame is stored, together with `y`, under the
    name returned by ``get_estimator_name(scaler)``.

    Parameters
    ---------
    X: two positional ArrayLike / pd.DataFrame
      Exactly two items: the numerical features first, then the categorical
      features.
    y: ArrayLike
      Target stored unchanged next to every scaled dataset.
    scalers: list
      Scaler estimators to apply to the numerical features.
    save: bool, default=False
      If ``True``, persist the collection with ``savejob`` and return
      ``None``.
    filename: str, default='DATA'
      Name passed to ``savejob`` as ``savefile``.

    Returns
    --------
    dict or None
      ``{scaler name: (X_scaled, y)}`` when ``save`` is falsy, else ``None``.

    """
    numeric_part, categorical_part = X

    bundles = {}
    for sc in scalers:
        merged = pd.concat(
            [scale_data(numeric_part, scaler=sc), categorical_part], axis=1)
        bundles[get_estimator_name(sc)] = (merged, y)

    if not save:
        return bundles

    savejob(bundles, savefile=filename)
    return None


def reduce_Xy(
    X,
    components,
    save=False,
    filename='PCADATA',
    ** reduce_kws
):
    """Project `X` with PCA once per requested number of components.

    Parameters
    --------------
    X: ArrayLike or pd.DataFrame
      Array or DataFrame with shape (n_samples, n_features).
    components: list, ArrayLike of int
      Values passed one by one to ``PCA(n_components=...)``.
    save: bool, default=False
      If ``True``, persist the projections with ``savejob`` and return
      ``None``.
    filename: str, default='PCADATA'
      Name passed to ``savejob`` as ``savefile``.
    reduce_kws: dict
      Extra keyword arguments forwarded to ``PCA``.

    Returns
    --------
    dict or None
      Maps ``'comp{:02}'.format(n)`` (e.g. ``'comp05'``, ``'comp10'``) to the
      transformed array when ``save`` is falsy, else ``None``.

    """
    projections = {}
    for n_comp in components:
        projected = PCA(n_components=n_comp, **reduce_kws).fit_transform(X)
        projections['comp{:02}'.format(n_comp)] = projected

    if not save:
        return projections

    savejob(projections, savefile=filename)
    return None


def fetch_pickled_data(file):
    """Load and return the object stored in the joblib file `file`.

    NOTE(review): ``joblib.load`` unpickles the file content, so only files
    from a trusted source should be loaded.
    """
    loaded = joblib.load(file)
    return loaded


def optimization(X, y, cv,  estimator,
                 param_grid, optimizer=GridSearchCV,
                 scoring="accuracy",  verbose=0,
                 return_o=True, return_both=False,
                 comp_name=None,
                 **optimizer_kws

                 ):
    """Cross-validate `estimator` on ``(X, y)`` with a search optimizer.

    Parameters
    ------------
    X, y:
       Training data and target passed to the optimizer's ``fit``.
    cv:
       Cross-validation setting forwarded to the optimizer.
    estimator:
       Estimator whose hyper-parameters are searched.
    param_grid: dict
       Search space. It is passed as ``search_spaces`` to `BayesSearchCV`,
       as ``param_distributions`` to `RandomizedSearchCV` and as
       ``param_grid`` to any other optimizer.
    optimizer: class, default=GridSearchCV
       Search class; it is dispatched on ``get_estimator_name(optimizer)``.
    scoring: str, default="accuracy"
       Scoring forwarded to the optimizer.
    verbose: int, default=0
       If truthy, print a short summary of the run.
    return_o: bool, default=True
       Return the results as a ``Boxspace`` object instead of a dict.
    return_both: bool, default=False
       Return ``(Boxspace, dict)``; implies ``return_o=True``.
    comp_name: str, optional
       Label printed next to "Estimator" in verbose mode.
    optimizer_kws: dict
       Extra keyword arguments for the optimizer. ``n_jobs`` (default -1)
       and, for the randomized / Bayesian searches, ``n_iter`` (default 10)
       can be overridden here.

    Returns
    ---------
    Boxspace, dict or tuple
       Holds ``best_params_``, ``best_estimator_`` and ``cv_results_`` of
       the fitted optimizer.

    """
    N_ITER = 10
    N_JOBS = -1

    optimizer_name = get_estimator_name(optimizer)

    # Defaults go through `setdefault` so a caller-supplied n_jobs / n_iter
    # overrides them instead of raising a duplicate-keyword TypeError.
    search_kws = dict(optimizer_kws)
    search_kws.setdefault('n_jobs', N_JOBS)

    if optimizer_name == 'BayesSearchCV':
        search_kws.setdefault('n_iter', N_ITER)
        opm = optimizer(estimator, search_spaces=param_grid, cv=cv,
                        scoring=scoring, **search_kws)
    elif optimizer_name == 'RandomizedSearchCV':
        search_kws.setdefault('n_iter', N_ITER)
        opm = optimizer(estimator, param_distributions=param_grid, cv=cv,
                        scoring=scoring, **search_kws)
    else:
        opm = optimizer(estimator, param_grid=param_grid, cv=cv,
                        scoring=scoring, **search_kws)

    opm.fit(X, y)

    # Params that we need
    dict_params = dict(
        best_params_=opm.best_params_,
        best_estimator_=opm.best_estimator_,
        cv_results_=opm.cv_results_,
    )

    if verbose:
        print("+"*90)
        print("{:<20}:{:>45}".format(
            "Estimator {}".format(comp_name), get_estimator_name(estimator)))
        print("{:<20}:{:>45}".format("Optimizer", optimizer_name))
        # str() keeps the alignment spec valid when cv is a splitter object
        # or scoring is a callable.
        print("{:<20}:{:>45}".format("CV", str(cv)))
        print("{:<20}:{:>45}".format('Scoring', str(scoring)))
        print("+"*90)

    if return_both:
        return Boxspace(**dict_params), dict_params
    if return_o:
        return Boxspace(**dict_params)
    # Deep copy only on the path that actually returns the dict.
    return copy.deepcopy(dict_params)


def evaluator(X, y, estimators, grid_params,  optimizers, cvs,
              return_o=True, verbose=0, return_both=False, comp_name=None,
              ):
    """Run :func:`optimization` for every estimator, optimizer and cv value.

    Parameters
    ------------
    X, y:
       Training data and target forwarded to :func:`optimization`.
    estimators: list
       Estimators to tune. Each one must have a name listed in the internal
       acronym table (LR, KNN, DT, SVM, RF, ADA, XGB, NB, LGBM, LDA, Ridge).
    grid_params: dict
       ``{estimator acronym: {search acronym: search space}}`` where the
       search acronym is one of ``'GSCV'``, ``'RSCV'`` or ``'BSCV'``.
    optimizers: list
       Search classes (`GridSearchCV`, `RandomizedSearchCV`,
       `BayesSearchCV`).
    cvs: iterable
       Cross-validation settings; each result is stored under ``f"cv{cv}"``.
    return_o: bool, default=True
       Wrap every nesting level into a ``Boxspace`` object.
    verbose: int, default=0
       Forwarded to :func:`optimization`.
    return_both: bool, default=False
       Return ``(object results, plain-dict results)``; implies
       ``return_o=True``.
    comp_name: str, optional
       Label forwarded to :func:`optimization` for its verbose output.

    Returns
    ---------
    Boxspace, dict or tuple
       Results nested as estimator acronym -> search acronym -> ``cv<k>``.

    Raises
    -------
    EstimatorError
       If an estimator or optimizer is not supported, or if `grid_params`
       has no entry for an estimator / optimizer pair.

    """
    map_estimators = {
        "LogisticRegression": "LR",
        'KNeighborsClassifier': 'KNN',
        'DecisionTreeClassifier': 'DT',
        'SVC': "SVM",
        'RandomForestClassifier': "RF",
        'AdaBoostClassifier': "ADA",
        'XGBClassifier': "XGB",
        'GaussianNB': "NB",
        "LGBMClassifier": "LGBM",
        "LinearDiscriminantAnalysis": "LDA",
        "Ridge": "Ridge"
    }

    cv_seekers = {"GridSearchCV": "GSCV",
                  "RandomizedSearchCV": "RSCV",
                  "BayesSearchCV": "BSCV"
                  }

    if return_both:
        return_o = True

    def _wrap(mapping):
        # Object mode exposes each level as attributes; dict mode keeps it.
        return Boxspace(**mapping) if return_o else mapping

    estimator_dict = {}
    estimator_dict2 = {}

    for estimator in estimators:
        estimator_name = get_estimator_name(estimator)
        if estimator_name not in map_estimators:
            raise EstimatorError(
                f"{estimator_name} not found. Please check"
                " your dict_params grid keys."
            )
        estimator_acronym = map_estimators[estimator_name]

        dict_estimator_params = grid_params.get(estimator_acronym)
        if dict_estimator_params is None:
            # Fail with a clear message instead of `None.get(...)` below.
            raise EstimatorError(
                f"No grid parameters found for {estimator_acronym!r}."
                " Please check your dict_params grid keys."
            )

        grid_dict = {}
        grid_dict2 = {}
        for optimizer in optimizers:
            optimizer_name = get_estimator_name(optimizer)
            search_acronym = cv_seekers.get(optimizer_name)
            if search_acronym is None:
                raise EstimatorError(
                    f"Optimizer {optimizer_name!r} is not supported."
                    f" Expect one of {sorted(cv_seekers)}."
                )
            grid_param = dict_estimator_params.get(search_acronym)
            if grid_param is None:
                raise EstimatorError(
                    f"No {search_acronym!r} search space found for"
                    f" {estimator_acronym!r}. Please check your"
                    " dict_params grid keys."
                )

            cv_dict = {}
            cv_dict2 = {}
            for cv in cvs:
                cv_results = optimization(
                    X, y,
                    cv=cv,
                    estimator=estimator,
                    param_grid=grid_param,
                    optimizer=optimizer,
                    return_o=return_o,
                    return_both=return_both,
                    comp_name=comp_name,
                    verbose=verbose
                )

                if return_both:
                    cv_results, cv_results2 = cv_results
                    cv_dict2[f"cv{cv}"] = cv_results2

                cv_dict[f"cv{cv}"] = cv_results

            grid_dict[search_acronym] = _wrap(cv_dict)
            if return_both:
                grid_dict2[search_acronym] = cv_dict2

        estimator_dict[estimator_acronym] = _wrap(grid_dict)
        if return_both:
            estimator_dict2[estimator_acronym] = grid_dict2

    if return_o:
        estimator_o = Boxspace(**estimator_dict)
    else:
        # One deep copy of the final dict replaces the per-level copies.
        estimator_o = copy.deepcopy(estimator_dict)

    if return_both:
        estimator_o = (estimator_o, estimator_dict2)

    return estimator_o


def predict(estimator, X_test=None, y_true=None,
            data=None, component=None,
            verbose=False):
    """Score a fitted `estimator` on test data.

    The test set is either given directly as ``(X_test, y_true)`` or read
    from the dictionary `data` under the key derived from `component` (see
    :func:`_validate_data`).

    Parameters
    ------------
    estimator:
       Fitted estimator exposing ``predict``.
    X_test, y_true: optional
       Test features and true labels.
    data: dict, optional
       Dictionary of ``(X, y)`` pairs keyed by component.
    component: optional
       Component to read from `data`; required when `data` is a dict.
    verbose: bool, default=False
       If ``True``, print the scores as a small table.

    Returns
    ---------
    Boxspace
       Object with ``component_``, ``accuracy_``, ``f1_score_``,
       ``precision_`` and ``recall_``.

    """
    X_test, y_true, component = _validate_data(X_test, y_true, data, component)

    predictions = estimator.predict(X_test)
    accuracy = accuracy_score(y_true, predictions)
    recall = recall_score(y_true, predictions)
    precision = precision_score(y_true, predictions)
    f1 = f1_score(y_true, predictions)

    if verbose:
        rule = "-"*110
        print(rule)
        if component:
            suffix = f"({str(component)})"
        else:
            suffix = "''"
        print("{:^110}".format(get_estimator_name(estimator) + suffix))
        print(rule)
        print("|{:^26}|{:^26}|{:^26}|{:^26}|".format(
            "Accuracy", "F1_score", "Precision", "Recall"))
        print(rule)
        rounded = [round(sc, 4) for sc in (accuracy, f1, precision, recall)]
        print("|{:>26}|{:>26}|{:>26}|{:>26}|".format(*rounded))
        print(rule)

    return Boxspace(
        component_=component,
        accuracy_=accuracy,
        f1_score_=f1,
        precision_=precision,
        recall_=recall,
    )


def _validate_data(
        X=None, y=None,  data=None, component=None):
    """ Validate the data either from pickled dictionnary or raw X, y. """
    if isinstance(data, dict):
        if not component:
            raise ValueError(" Missing component. Dictionnary"
                             " need the component to be specified.")
        try:
            regex_val = re.search(r'\d+', str(component),
                                  flags=re.IGNORECASE).group()
        except:
            raise TypeError("Expect a digit in the given component."
                            f" Got {component}")
        regex_val = int(regex_val)

        Xy = data.get(f"comp{regex_val}")
        if Xy is None:
            raise ValueError("Component {component} does not exist in pickled"
                             f" data. Valid keys are {smart_format( data.keys())}")
        X, y = Xy

    if X is None or y is None:
        raise TypeError(" Need X, y")

    return X, y, component


In [7]:
# Load the train and test sets from their joblib files.

# XXX IMPORTANT ! (3)
# NOTE: `dict_test_data` is read as a module-level name by `display_results`,
# so this cell must run before that function is called.
dict_train_data = fetch_pickled_data("traindata.pca.joblib")
dict_test_data = fetch_pickled_data("testdata.pca.joblib")

# Unpack the "comp10" entry of each dictionary as an example. `.get` returns
# None for a missing key, in which case the unpacking raises a TypeError.

X10train, y10train = dict_train_data.get("comp10")
X10test, y10test = dict_test_data.get("comp10")


In [8]:
# XXX IMPORTANT ! (4)

# Hyper-parameter search spaces, keyed by estimator acronym and then by
# search acronym:
#   "GSCV" -> value lists for GridSearchCV
#   "RSCV" -> value lists / ranges sampled by RandomizedSearchCV
#   "BSCV" -> skopt dimensions (Real / Integer / Categorical) for
#             BayesSearchCV
# The "RSCV" spaces hold explicit ranges rather than a single
# `np.random.randint(...)` draw, which would freeze one unseeded random value
# when the cell runs and leave nothing to sample.
dict_estimators = {
    # OPTIMIZE LR
    "LR": {
        "GSCV": {
            # [0.01, 0.1, 1, 10, 100],
            'C': [np.log2(x) for x in np.arange(1, 10)],
            'solver': ['liblinear', 'saga'],
            'penalty': ['l1', 'l2'],  # I added
        },
        "RSCV": {
            # np.logspace(-4, 4, 20),
            'C': [np.log2(x) for x in np.arange(1, 10)],
            'solver': ['liblinear', 'saga'],
            'penalty': ['l1', 'l2'],  # I added
        },
        "BSCV": {
            # (1e-6, 1e+6, 'log-uniform'),
            'C': Real(np.log2(2), np.log2(10)),
            'solver': Categorical(['liblinear', 'saga']),
            'penalty': Categorical(['l1', 'l2']),  # I added
        },
    },
    # OPTIMIZE KNN
    "KNN": {
        "GSCV": {
            'n_neighbors': range(1, 31),
            'metric': ['euclidean', 'manhattan'],
            'weights': ['uniform', 'distance'],
        },
        "RSCV": {
            'n_neighbors': range(1, 31),
            'metric': ['euclidean', 'manhattan'],
            'weights': ['uniform', 'distance'],
        },
        "BSCV": {
            # skopt bounds are inclusive: 1..30 matches range(1, 31) above.
            'n_neighbors': Integer(1, 30),
            'metric': Categorical(['euclidean', 'manhattan']),
            'weights': Categorical(['uniform', 'distance']),
        },
    },
    # OPTIMIZE SVM
    "SVM": {
        "GSCV": {
            # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            'C': [np.log2(x) for x in np.arange(2, 10)],
            'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
            'class_weight': [None, 'balanced'],
            'gamma': ['scale', 'auto'],
            'degree': [1, 2, 3, 4, 5],  # Integer(1, 5),
            'coef0': range(10)  # Real(0, 10),
        },
        "RSCV": {
            'C': [np.log2(x) for x in np.arange(2, 10)],  # expon(scale=100),
            'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
            'gamma': ['scale', 'auto'],
            'class_weight': [None, 'balanced'],
            'degree': [1, 2, 3, 4, 5],
            'coef0': range(10)  # [i for i in range(10)]# Real(0, 10),
        },
        "BSCV": {
            'C': Real(np.log2(2), np.log2(10)),
            # Real(1e-6, 1e+1, prior='log-uniform'),
            'gamma': Categorical(['scale', 'auto']),
            'kernel':  Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
            'degree': Integer(1, 5),
            'coef0': Integer(1, 10),
            'class_weight': Categorical([None, 'balanced']),
        },
    },
    # OPTIMIZE XGB
    "XGB": {
        "GSCV": {
            'max_depth': range(1, 30),  # [3, 5, 7, 9],
            'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
            'n_estimators': [10, 20, 30],
            'subsample': [0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        },
        "RSCV": {
            # Integer(1, 30),  # [i for i in np.arange(3, 10)],
            'max_depth': range(1, 30),
            # np.random.uniform(0.001, 0.01, 4),
            'learning_rate':  [0.0001, 0.001, 0.01, 0.1, 1, 10],
            'n_estimators': [10, 20, 30],  # [np.random.randint(50, 300)],
            # [np.random.uniform(0.7, 0.3)],
            'subsample': [0.7, 0.8, 0.9, 1.0],
            # np.random.uniform(0.7, 0.3, 4)
            'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        },
        "BSCV": {
            'max_depth':  Integer(1, 30),  # Integer(3, 10),
            # Real(0.01, 0.3),
            'learning_rate': Real(0.0001, 10),
            'n_estimators': Integer(10, 30),  # Integer(50, 300),
            # [0.7, 0.8, 0.9, 1.0],  # Real(0.7, 1.0),
            'subsample': Real(0.7, 1.),
            # [0.7, 0.8, 0.9, 1.0],  # Real(0.7, 1.0)
            'colsample_bytree': Real(.7, 1.),
        },
    },
    # OPTIMIZE NB
    "NB": {
        "GSCV": {
            'var_smoothing': [i for i in np.arange(1e-4, 1e+4, 10)],
            # 'priors': ['0', '1']
        },
        "RSCV": {
            'var_smoothing': [i for i in np.arange(1e-4, 1e+4, 10)],
            # 'priors': ['0', '1']
        },
        "BSCV": {
            # [i for i in np.arange(1e-4, 1e+4, 10)],
            'var_smoothing': Real(1e-4, 1e+4)
        },
    },
    # OPTIMIZE DT
    "DT": {
        "GSCV": {
            'max_depth': [1, 10, 20, 30],
            'min_samples_split': range(2, 10),  # [2, 5, 8, 10],
            'min_samples_leaf': range(1, 20),  # [1, 2, 3, 4],
            'criterion': ['gini', 'entropy'],
        },
        "RSCV": {
            'max_depth': range(1, 30),  # [None] + list(range(5, 55, 5)),
            # was: [np.random.randint(2, 10)] -- a single frozen random value
            'min_samples_split': range(2, 10),
            # was: [np.random.randint(1, 20)]
            'min_samples_leaf': range(1, 20),
            'criterion': ['gini', 'entropy'],
        },
        "BSCV": {
            'max_depth': Integer(1, 30),
            'min_samples_split': Integer(2, 10),
            'min_samples_leaf': Integer(1, 20),
            'criterion': Categorical(['gini', 'entropy']),
        },
    },
    # OPTIMIZE RF
    "RF": {
        "GSCV": {
            'n_estimators': [10, 20, 30],
            'max_depth': [1, 10, 20, 30],
            'min_samples_split': range(2, 10),  # [2, 5, 10],
            'min_samples_leaf': range(1, 20),  # [1, 2, 4],
            'bootstrap': [True, False],
        },
        "RSCV": {
            'n_estimators': [10, 20, 30],  # Integer(10, 30),
            'max_depth': range(1, 30),  # [None] + list(range(5, 55, 5)),
            # was: [np.random.randint(2, 4)] -- a single frozen random value
            'min_samples_split': range(2, 4),
            # was: [np.random.randint(1, 20)]
            'min_samples_leaf': range(1, 20),
            'bootstrap': [True, False],
        },
        "BSCV": {
            'n_estimators': Integer(10, 30),  # [ 10, 20, 30],
            'max_depth': Integer(1, 30),  # Integer(5, 50),
            'min_samples_split': Integer(2, 10),
            'min_samples_leaf': Integer(1, 20),
            'bootstrap': Categorical([True, False]),
        },
    },
    # OPTIMIZE LDA
    "LDA": {
        "GSCV": {
            'solver': ['svd', 'lsqr', 'eigen'],
            # 'shrinkage': [None, 'auto'],
        },
        "RSCV": {
            'solver': ['svd', 'lsqr', 'eigen'],
            # 'shrinkage': [None, 'auto'],
        },
        "BSCV": {
            'solver': Categorical(['svd', 'lsqr', 'eigen']),
            # 'shrinkage': [None, 'auto'],
        },
    },
    # OPTIMIZE ADA
    # NOTE(review): newer scikit-learn releases rename AdaBoost's
    # `base_estimator` parameter to `estimator` -- confirm against the
    # installed version.
    "ADA": {
        "GSCV": {
            'n_estimators': [10, 20, 30],
            'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1, 10],
            'base_estimator': [DecisionTreeClassifier(max_depth=d) for d in range(1, 4)],
        },
        "RSCV": {
            'n_estimators': [10, 20, 30],  # Integer(10, 30),
            'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1, 10],
            'base_estimator': [DecisionTreeClassifier(max_depth=d) for d in range(1, 4)],
        },
        "BSCV": {
            'n_estimators': Integer(10, 30),  # [10, 20, 30],
            # [0.0001, 0.001, 0.01, 0.1, 1, 10],
            'learning_rate': Real(1e-4, 10.),
            'base_estimator': Categorical([DecisionTreeClassifier(max_depth=d) for d in range(1, 4)]),
        },
    },
    # OPTIMIZE LGBM
    "LGBM": {
        "GSCV": {
            'num_leaves': range(20, 150),   # [31, 50, 100],
            'max_depth': [1, 10, 20, 30],
            'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1, 10],
            'n_estimators': [10, 20, 30],
            'boosting_type': ['gbdt', 'goss'],
        },
        "RSCV": {
            # was: [np.random.randint(20, 150)] -- a single frozen random value
            'num_leaves': range(20, 150),
            # was: [np.random.randint(1, 30)]
            'max_depth': range(1, 30),
            # [np.random.uniform(0.01, 0.2)],
            'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1, 10],
            # was: [np.random.randint(10, 30)]
            'n_estimators': range(10, 30),
            'boosting_type': ['gbdt', 'goss'],

        },
        # XXX FIT ::RFERENCE::
        "BSCV": {
            'num_leaves': Integer(20, 150),
            'max_depth': Integer(1, 30),
            'learning_rate':  [0.0001, 0.001, 0.01, 0.1, 1, 10],
            'n_estimators': Integer(10, 30),
            # 'colsample_bytree': Real(0.5, 1.0),
            'boosting_type': Categorical(['gbdt', 'goss']),
        },
    },
}

In [9]:

# XXX IMPORTANT ! (5)


def run_metadata(
    estimators,
    grid_params,
    optimizers, data,
    cvs=(5, 10),
    components=None,
    save_as=None,
    savefile='run_inv.joblib',
    append_date=False,
    append_versions=False,
    verbose=False,
):
    """ Run the data with multiple estimators.

    Parameters
    ------------
    estimators: list
       List of scikit-learn estimators or gradient boosting estimators.

    grid_params: dict
       Dictionary of all estimator grid parameters for cross-validation.

    optimizers: list
       List of search classes. It can be [GridSearchCV, RandomizedSearchCV,
       BayesSearchCV].

    data: dict
       Dictionary of ``(X, y)`` pairs loaded from a pickle / joblib file.

    cvs: list, default=(5, 10)
       Cross validation folds.

    components: list, optional
       List of PCA component data to retrieve from the `data`. If ``None``,
       all keys of `data` are used instead.

    save_as: str, optional, {'object', 'dict', 'both'}
       How the results are structured. Any value containing ``'both'``
       builds object and dictionary results; any value containing ``'obj'``
       builds ``Boxspace`` objects; anything else (including ``None``)
       builds plain dictionaries.

    savefile: str, default='run_inv.joblib'
       The name of the file used to store the modeling results. A falsy
       value skips saving.

    append_date: bool, default=False
       Forwarded to ``savejob``; append the date to the `savefile`.

    append_versions: bool, default=False
       Forwarded to ``savejob``; append the requirement dependencies to the
       `savefile`.

    verbose: bool, default=False
       Control the level of verbosity. If ``False``, mute the process.

    Return
    -------
    fo: :class:`watex.utils.box.Boxspace` or dict
       Modeling results of every component, stored under ``f"data{comp}"``.
       A component whose evaluation failed holds an ``eRROR`` marker and a
       warning is emitted.

    Notes
    ------
    The saved file holds three entries: ``datao`` (the returned results),
    ``data`` (the per-component dictionary) and ``data2`` (the plain-dict
    results per component, filled only when `save_as` contains 'both').

    Examples
    ----------
    >>> # fetch the best estimator from the pickled
    >>> # dictionary data
    >>> dict_train_data = fetch_pickled_data("traindata.pca.joblib")
    >>> # run now
    >>> oo = run_metadata (
        estimators = [SVC() ],
        optimizers =[ RandomizedSearchCV],
        components = (10, 15 ),
        grid_params = dict_estimators,
        data = dict_train_data,
        cvs= (5, 10),
        verbose=True,
        save_as = 'object',

        )
    >>> oo.data10.SVM.RSCV.cv5.best_estimator_
    Out[101]: SVC(C=305.69100027133396, gamma=1e-08, kernel='linear')
    >>> oo.data10.SVM.RSCV.cv5.best_params_
    Out[102]:
    {'C': 305.69100027133396,
     'class_weight': None,
     'gamma': 1e-08,
     'kernel': 'linear'}


    """
    save_as = str(save_as).lower().strip()
    return_both = 'both' in save_as
    return_o = return_both or 'obj' in save_as

    if not components:
        components = list(data.keys())

    components = is_iterable(components, exclude_string=True, transform=True)

    data_dict = {}
    data_dict2 = {}
    for comp in components:
        if verbose:
            print("{:-^110}".format(f"DATA:{comp}"))
        X, y, _ = _validate_data(data=data, component=comp)

        raw_results = {}
        try:
            o = evaluator(
                X=X,
                y=y,
                estimators=estimators,
                grid_params=grid_params,
                optimizers=optimizers,
                cvs=cvs,
                verbose=verbose,
                return_both=return_both,
                return_o=return_o,
                comp_name="data={}".format(str(comp).replace("comp", ""))
            )
            if return_both:
                o, raw_results = o
        except (Exception, EstimatorError) as err:
            # Best effort: one failing component must not abort the run, but
            # the failure is reported instead of being silently swallowed.
            warnings.warn(
                f"Evaluation failed for component {comp!r}: {err!r}")
            error_marker = {"eRROR": f"eRROR--component={comp}"}
            o = Boxspace(**error_marker) if return_o else error_marker

        data_dict[f"data{comp}"] = o
        if return_both:
            # Keep the plain-dict results of every component, not just the
            # last one processed.
            data_dict2[f"data{comp}"] = raw_results

    if return_o:
        fo = Boxspace(**data_dict)
    else:
        fo = copy.deepcopy(data_dict)

    file_data = dict(datao=fo, data=data_dict, data2=data_dict2)

    if savefile:
        savejob(file_data,
                savefile=savefile,
                append_date=append_date,
                append_versions=append_versions,
                )
    return fo


#

In [10]:
 # XXX IMPORTANT ! (6)

# Load the training dictionary from "traindata.pca.joblib" (the same load is
# also performed in an earlier cell of this notebook).
dict_train_data = fetch_pickled_data("traindata.pca.joblib")
# # %%

In [11]:
 # XXX IMPORTANT ! (7)
# run all


In [12]:
# Load the saved modelling results; `display_results` reads the 'datao' entry.
# NOTE(review): joblib.load unpickles the file, so only load files from a
# trusted source.
DATA = joblib.load("ModResults.joblib")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [13]:
# %%
# Results stored under 'datao'. `display_results` looks this entry up again
# from the dictionary it receives rather than using this variable.
datao = DATA['datao']
#     )

def display_results(data_dict, stop='10', test_data=None):
    """Print a table of the tuning results and the matching test scores.

    For every estimator acronym, search strategy (RSCV, GSCV, BSCV) and cv
    setting (cv5, cv10) the best parameters, the scores computed by
    ``getGlobalScores`` from ``cv_results_`` and the test scores computed by
    :func:`predict` are printed on one row.

    Parameters
    ------------
    data_dict: dict
       Modelling results; the ``'datao'`` entry must expose the results as
       nested attributes ``datacomp<n>.<estimator>.<grid>.<cv>``.
    stop: int or str, default='10'
       Component to display, one of 10, 15, 20, 25, 30, 35, 40, 45, 50.
       ``None`` displays all of them.
    test_data: dict, optional
       Test sets keyed by component and passed to :func:`predict`. When
       omitted, the notebook-level ``dict_test_data`` is used.

    Raises
    -------
    TypeError
       If the requested component is not available.

    """
    map_estimators = {
        "LogisticRegression": "LR",
        'KNeighborsClassifier': 'KNN',
        'DecisionTreeClassifier': 'DT',
        'SVC': "SVM",
        'RandomForestClassifier': "RF",
        'AdaBoostClassifier': "ADA",
        'XGBClassifier': "XGB",
        'GaussianNB': "NB",
        "LGBMClassifier": "LGBM",
        "LinearDiscriminantAnalysis": "LDA",
        # "Ridge": "Ridge"
    }
    if test_data is None:
        # Backward-compatible fallback on the notebook-level test data.
        test_data = dict_test_data

    maincomp = [10, 15, 20, 25, 30, 35, 40, 45, 50]
    components = [f'comp{i}' for i in maincomp]

    if stop is not None:
        components = is_in_if(
            [f"comp{stop}"], components, return_intersect=True)
    if components is None:
        raise TypeError(
            f"data {stop} is not available. Expect {smart_format(maincomp)}")
    est_keys = list(map_estimators.values())
    cvs = ['cv5', 'cv10']
    grids = ['RSCV', 'GSCV', 'BSCV']
    results = data_dict['datao']

    row_format = ("{:^15} | {:^15} | {:^5} | {:^5} | {:^150} | {:^40} |"
                  " {:^80}")

    print("+"*320)
    print(row_format.format(
        "DATA", "Estimators", "Grid", "CV",  "best_params",
        "Train scores (acc)", "Test scores"))
    print("+"*320)
    for comp in components:
        data = f"data{comp}"
        comp_results = getattr(results, data)
        for estimator in est_keys:
            for grid in grids:
                for cv in cvs:
                    cv_obj = getattr(
                        getattr(getattr(comp_results, estimator), grid), cv)
                    best_estimator = cv_obj.best_estimator_
                    best_params = cv_obj.best_params_
                    ttest_scores, std_scores = getGlobalScores(
                        cv_obj.cv_results_)
                    po = predict(best_estimator,
                                 data=test_data, component=comp)

                    print(row_format.format(
                        data, estimator, grid,  cv,  str(best_params),
                        "{:<5}-{:>5}".format(round(ttest_scores, 3),
                                             round(std_scores, 3)),
                        "acc:{:<10} - precision:{:<10} - recall {:<10} - f1_score: {:<10}".format(
                            round(po.accuracy_, 4), round(
                                po.precision_, 4), round(po.recall_, 4),
                            round(po.f1_score_, 4))
                    )
                    )

        print("-"*320)
        if str(stop) in comp:
            break

In [14]:

# Display the results of one component. `stop` selects the component and must
# be one of [10, 15, 20, 25, 30, 35, 40, 45, 50].
display_results(DATA, stop = 10)


++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     DATA       |   Estimators    | Grid  |  CV   |                                                                      best_params                                                                       |            Train scores (acc)            |                                   Test scores                                   
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  datacomp10    |       LR   