# Set and get hyperparameters (v2)
> The process of learning a predictive model is driven by a set of internal parameters and a set of training data. These internal parameters are called hyperparameters and are specific to each family of models. In addition, a specific set of hyperparameters is optimal for a specific dataset, and thus they need to be optimized.
- toc: true
- badges: false
- comments: true
- author: Cécile Gallioz
- categories: [sklearn, v2]

# Loading

In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
import time

In [2]:
# The CSV is resolved relative to the notebook's working directory.
census_csv_path = "../../scikit-learn-mooc/datasets/adult-census.csv"
myData = pd.read_csv(census_csv_path)

In [3]:
# Remove the "education-num" column from the table.
# errors="ignore" keeps this cell idempotent: because the result is assigned back
# to `myData`, re-running the cell once the column is already gone would
# otherwise raise a KeyError.
myData = myData.drop(columns="education-num", errors="ignore")

In [4]:
# Report the table size. Note: at this point `myData` still contains the target
# column, so the second number counts it as well.
size_report = f"The dataset data contains {myData.shape[0]} samples and {myData.shape[1]} features"
print(size_report)

The dataset data contains 48842 samples and 13 features


In [5]:
target_column = "class"

# Features: every column except the target. Target: the `class` column alone.
data = myData.drop(columns=target_column)
target = myData[target_column]

In [6]:
from sklearn.compose import make_column_selector as selector
# 
# Build one selector per dtype group, then apply each to the feature table.
select_numerical = selector(dtype_exclude=object)
select_categorical = selector(dtype_include=object)

numerical_columns = select_numerical(data)
categorical_columns = select_categorical(data)

# Reorder the table so numerical columns come first, followed by categorical ones.
all_columns = numerical_columns + categorical_columns
data = data[all_columns]

In [7]:
# Sub-tables restricted to each dtype group.
data_numerical, data_categorical = data[numerical_columns], data[categorical_columns]

# On default value

In [8]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
#

# Integer-encode categorical columns; categories unknown at transform time are
# encoded with the configured value -1.
categorical_preprocessor = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Only the categorical columns go through the encoder; the rest pass through.
preprocessor = ColumnTransformer(
    transformers=[("categorical", categorical_preprocessor, categorical_columns)],
    remainder="passthrough",
)

# Preprocessing followed by a gradient-boosting classifier with max_leaf_nodes=4.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4)),
    ]
)

# Five seeded shuffle splits, 30% of the samples held out for testing in each.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
cv_results = cross_validate(model, data, target, cv=cv, return_train_score=True)

train_scores = cv_results["train_score"]
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]

train_report = (
    "The accuracy in TRAIN is "
    f"{train_scores.mean():.3f} +/- {train_scores.std():.3f}"
)
test_report = (
    "The accuracy in TEST  is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds"
)
print(train_report)
print(test_report)

The accuracy in TRAIN is 0.864 +/- 0.001
The accuracy in TEST  is 0.862 +/- 0.001, for 0.512 seconds


# Setting hyperparameters ourselves

In [9]:
# List every tunable parameter name of the pipeline, nested ones included
# (nested names use the "<step>__<param>" form).
for parameter in model.get_params(deep=True):
    print(parameter)

memory
steps
verbose
preprocessor
classifier
preprocessor__n_jobs
preprocessor__remainder
preprocessor__sparse_threshold
preprocessor__transformer_weights
preprocessor__transformers
preprocessor__verbose
preprocessor__verbose_feature_names_out
preprocessor__categorical
preprocessor__categorical__categories
preprocessor__categorical__dtype
preprocessor__categorical__handle_unknown
preprocessor__categorical__unknown_value
classifier__categorical_features
classifier__early_stopping
classifier__l2_regularization
classifier__learning_rate
classifier__loss
classifier__max_bins
classifier__max_depth
classifier__max_iter
classifier__max_leaf_nodes
classifier__min_samples_leaf
classifier__monotonic_cst
classifier__n_iter_no_change
classifier__random_state
classifier__scoring
classifier__tol
classifier__validation_fraction
classifier__verbose
classifier__warm_start


In [10]:
# Override one nested hyperparameter of the pipeline in place.
model.set_params(classifier__max_leaf_nodes=2)

cv_results = cross_validate(model, data, target, cv=cv, return_train_score=True)

train_scores = cv_results["train_score"]
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]

# Read the value back from the model so the header reflects what was actually set.
header = f"Accuracy score via cross-validation with mln={model.get_params()['classifier__max_leaf_nodes']}:"
train_report = (
    "The accuracy in TRAIN is "
    f"{train_scores.mean():.3f} +/- {train_scores.std():.3f}"
)
test_report = (
    "The accuracy in TEST  is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds\n"
)
print(header, train_report, test_report, sep="\n")

Accuracy score via cross-validation with mln=2:
The accuracy in TRAIN is 0.826 +/- 0.001
The accuracy in TEST  is 0.825 +/- 0.002, for 0.461 seconds



# Testing hyperparameters manually

In [11]:
# Every (max_leaf_nodes, learning_rate) pair, max_leaf_nodes varying slowest.
manual_grid = [
    (leaf_nodes, rate)
    for leaf_nodes in [3, 10, 30]
    for rate in [0.01, 0.1, 1, 10]
]

for mln, lr in manual_grid:
    # A single call updates both nested classifier hyperparameters in place.
    model.set_params(classifier__max_leaf_nodes=mln, classifier__learning_rate=lr)

    cv_results = cross_validate(model, data, target, cv=cv, return_train_score=True)

    train_scores = cv_results["train_score"]
    scores = cv_results["test_score"]
    fit_time = cv_results["fit_time"]

    header = f"Accuracy score via cross-validation with mln={mln} and lr={lr}:"
    train_report = (
        "The accuracy in TRAIN is "
        f"{train_scores.mean():.3f} +/- {train_scores.std():.3f}"
    )
    test_report = (
        "The accuracy in TEST  is "
        f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds\n"
    )
    print(header, train_report, test_report, sep="\n")

Accuracy score via cross-validation with mln=3 and lr=0.01:
The accuracy in TRAIN is 0.799 +/- 0.001
The accuracy in TEST  is 0.799 +/- 0.001, for 0.534 seconds

Accuracy score via cross-validation with mln=3 and lr=0.1:
The accuracy in TRAIN is 0.857 +/- 0.001
The accuracy in TEST  is 0.856 +/- 0.001, for 0.601 seconds

Accuracy score via cross-validation with mln=3 and lr=1:
The accuracy in TRAIN is 0.865 +/- 0.004
The accuracy in TEST  is 0.862 +/- 0.006, for 0.271 seconds

Accuracy score via cross-validation with mln=3 and lr=10:
The accuracy in TRAIN is 0.281 +/- 0.001
The accuracy in TEST  is 0.279 +/- 0.002, for 0.190 seconds

Accuracy score via cross-validation with mln=10 and lr=0.01:
The accuracy in TRAIN is 0.820 +/- 0.001
The accuracy in TEST  is 0.819 +/- 0.002, for 0.808 seconds

Accuracy score via cross-validation with mln=10 and lr=0.1:
The accuracy in TRAIN is 0.873 +/- 0.001
The accuracy in TEST  is 0.870 +/- 0.001, for 0.836 seconds

Accuracy score via cross-validati

# Testing hyperparameters with param_grid

In [12]:
from sklearn.model_selection import GridSearchCV
#

# Candidate values per nested hyperparameter; the search tries every combination.
param_grid = dict(
    classifier__learning_rate=(0.01, 0.1, 1, 10),
    classifier__max_leaf_nodes=(3, 10, 30),
)

model_grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    n_jobs=2,
)

In [13]:
# Cross-validate the grid search itself; return_estimator=True keeps the fitted
# search object of each split so its best_params_ can be inspected afterwards.
cv_results = cross_validate(
    estimator=model_grid_search,
    X=data,
    y=target,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)

In [14]:
# Unpack the per-split arrays and fitted estimators from cv_results.
train_scores = cv_results["train_score"]
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
best_estimators = cv_results["estimator"]

summary_lines = [
    "The accuracy in TRAIN is "
    f"{train_scores.mean():.3f} +/- {train_scores.std():.3f}",
    "The accuracy in TEST  is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds\n",
]
print("\n".join(summary_lines))

The accuracy in TRAIN is 0.882 +/- 0.001
The accuracy in TEST  is 0.872 +/- 0.001, for 22.876 seconds



In [15]:
# One report per split: selected hyperparameters, then train/test accuracy and fit time.
for cv_fold, estimator_in_fold in enumerate(best_estimators):
    params_report = (
        f"Best hyperparameters for fold #{cv_fold + 1}:\n"
        f"{estimator_in_fold.best_params_}"
    )
    train_report = (
        "The accuracy in TRAIN is "
        f"{train_scores[cv_fold]:.3f}"
    )
    test_report = (
        "The accuracy in TEST  is "
        f"{scores[cv_fold]:.3f}, for {fit_time[cv_fold]:.3f} seconds\n"
    )
    print(params_report, train_report, test_report, sep="\n")

Best hyperparameters for fold #1:
{'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
The accuracy in TRAIN is 0.881
The accuracy in TEST  is 0.874, for 23.114 seconds

Best hyperparameters for fold #2:
{'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
The accuracy in TRAIN is 0.882
The accuracy in TEST  is 0.871, for 20.990 seconds

Best hyperparameters for fold #3:
{'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
The accuracy in TRAIN is 0.883
The accuracy in TEST  is 0.870, for 21.488 seconds

Best hyperparameters for fold #4:
{'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
The accuracy in TRAIN is 0.881
The accuracy in TEST  is 0.873, for 21.706 seconds

Best hyperparameters for fold #5:
{'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
The accuracy in TRAIN is 0.882
The accuracy in TEST  is 0.873, for 27.082 seconds



# Randomization

In [16]:
from scipy.stats import loguniform


class loguniform_int:
    """Log-uniform distribution whose samples are cast to integers.

    Wraps ``loguniform(a, b)`` and exposes the ``rvs`` sampling method, with
    every draw converted through ``astype(int)``.
    """

    def __init__(self, a, b):
        # Underlying continuous log-uniform distribution built from (a, b).
        self._distribution = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        """Draw from the wrapped distribution and return the draws as integers.

        All positional and keyword arguments are forwarded unchanged to the
        wrapped distribution's ``rvs``.
        """
        draws = self._distribution.rvs(*args, **kwargs)
        return draws.astype(int)

In [17]:
from sklearn.model_selection import RandomizedSearchCV
#
# param_distributions = {
#     'classifier__l2_regularization': loguniform(1e-6, 1e3),
#     'classifier__learning_rate': loguniform(0.001, 10),
#     'classifier__max_leaf_nodes': loguniform_int(2, 256),
#     'classifier__min_samples_leaf': loguniform_int(1, 100),
#     'classifier__max_bins': loguniform_int(2, 255),
# }

# Sampling distribution per nested hyperparameter: a continuous log-uniform for
# the learning rate, an integer-valued log-uniform for the number of leaves.
param_distributions = dict(
    classifier__learning_rate=loguniform(0.001, 10),
    classifier__max_leaf_nodes=loguniform_int(2, 256),
)

In [18]:
# Randomized search over `param_distributions`, evaluating 10 sampled candidates.
# random_state seeds the sampling of those candidates; without it each run draws
# different hyperparameters, so the reported best parameters and scores cannot
# be reproduced even though the classifier and the CV splitter are seeded.
model_random_search = RandomizedSearchCV(
    model, param_distributions=param_distributions, n_iter=10,
    cv=cv, verbose=1, random_state=0)

In [19]:
# Cross-validate the randomized search itself, keeping each split's fitted
# search object so its selected hyperparameters can be inspected afterwards.
outer_cv_options = dict(cv=cv, return_train_score=True, return_estimator=True)
cv_results = cross_validate(model_random_search, data, target, **outer_cv_options)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [20]:
# Unpack the per-split arrays and fitted estimators from cv_results.
best_estimators = cv_results["estimator"]
fit_time = cv_results["fit_time"]
train_scores = cv_results["train_score"]
scores = cv_results["test_score"]

train_report = (
    "The accuracy in TRAIN is "
    f"{train_scores.mean():.3f} +/- {train_scores.std():.3f}"
)
test_report = (
    "The accuracy in TEST  is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds\n"
)
print(train_report, test_report, sep="\n")

The accuracy in TRAIN is 0.879 +/- 0.005
The accuracy in TEST  is 0.871 +/- 0.002, for 59.890 seconds



In [21]:
# One report per split: sampled hyperparameters that won, then accuracy and fit time.
for cv_fold, estimator_in_fold in enumerate(best_estimators):
    report_lines = [
        f"Best hyperparameters for fold #{cv_fold + 1}:\n"
        f"{estimator_in_fold.best_params_}",
        "The accuracy in TRAIN is "
        f"{train_scores[cv_fold]:.3f}",
        "The accuracy in TEST  is "
        f"{scores[cv_fold]:.3f}, for {fit_time[cv_fold]:.3f} seconds\n",
    ]
    print("\n".join(report_lines))

Best hyperparameters for fold #1:
{'classifier__learning_rate': 0.5360979653669641, 'classifier__max_leaf_nodes': 4}
The accuracy in TRAIN is 0.872
The accuracy in TEST  is 0.869, for 52.325 seconds

Best hyperparameters for fold #2:
{'classifier__learning_rate': 0.10420574471319943, 'classifier__max_leaf_nodes': 23}
The accuracy in TRAIN is 0.880
The accuracy in TEST  is 0.872, for 40.249 seconds

Best hyperparameters for fold #3:
{'classifier__learning_rate': 0.03086058204962357, 'classifier__max_leaf_nodes': 44}
The accuracy in TRAIN is 0.875
The accuracy in TEST  is 0.870, for 52.635 seconds

Best hyperparameters for fold #4:
{'classifier__learning_rate': 0.2072080935893539, 'classifier__max_leaf_nodes': 28}
The accuracy in TRAIN is 0.883
The accuracy in TEST  is 0.874, for 114.106 seconds

Best hyperparameters for fold #5:
{'classifier__learning_rate': 0.31645381205607565, 'classifier__max_leaf_nodes': 33}
The accuracy in TRAIN is 0.885
The accuracy in TEST  is 0.872, for 40.136 s