In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

# Load the census table, then separate the label column from the features.
target_name = "class"
adult_census = pd.read_csv("../data/adult-census-numeric-all.csv")
target = adult_census[target_name]

# "education-num" is removed from the feature set together with the label.
dropped_columns = [target_name, "education-num"]
data = adult_census.drop(columns=dropped_columns)

# train_size=0.2 puts only 20% of the rows in the training split; the
# remaining 80% are held out as the test split. The fixed random_state makes
# the shuffle reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    train_size=0.2,
    random_state=42,
)

In [2]:
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

# Integer-encode categorical columns; categories not seen during fit are
# mapped to -1 instead of raising at transform time.
categorical_preprocessor = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Only object-dtype columns are routed through the encoder; every other
# column is forwarded unchanged (remainder="passthrough").
# NOTE(review): the data file is named "adult-census-numeric-all.csv" - if it
# really holds numeric columns only, this selector matches nothing and the
# encoder is a no-op. Confirm the intended dataset.
categorical_columns = selector(dtype_include=object)
preprocessor = make_column_transformer(
    (categorical_preprocessor, categorical_columns),
    remainder="passthrough",
)

# Two-step pipeline: preprocessing followed by a histogram-based gradient
# boosting classifier. The step names ("preprocessor", "classifier") are the
# prefixes used later when setting nested parameters.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", HistGradientBoostingClassifier(random_state=42)),
    ]
)

In [3]:
from sklearn.model_selection import cross_val_score

# Candidate values for the two classifier hyperparameters being tuned.
learning_rates = [0.01, 0.1, 1.0]
max_leaf_values = [3, 10, 30]

# Best combination seen so far. A candidate only replaces it when its mean
# score is strictly greater, so the earliest combination wins on ties.
best_score, best_params = 0.0, {}

# Enumerate the full grid up front, learning rate varying slowest.
grid = [(rate, leaves) for rate in learning_rates for leaves in max_leaf_values]

for lr, max_leaf in grid:
    candidate = {"learning_rate": lr, "max_leaf_nodes": max_leaf}

    # Nested pipeline parameters are addressed as "<step name>__<parameter>".
    model.set_params(
        **{f"classifier__{name}": value for name, value in candidate.items()}
    )

    # 5-fold cross-validation on the training split only, using two workers.
    scores = cross_val_score(model, data_train, target_train, cv=5, n_jobs=2)
    mean_score = scores.mean()

    if mean_score > best_score:
        best_score, best_params = mean_score, candidate

print("Best params on train set:", best_params)
print(f"Best mean CV accuracy: {best_score:.3f}")

Best params on train set: {'learning_rate': 0.1, 'max_leaf_nodes': 10}
Best mean CV accuracy: 0.821


In [4]:
# Re-apply the winning hyperparameters (the search loop leaves the pipeline
# configured with the last combination it tried), then refit on the whole
# training split.
tuned_settings = {
    "classifier__learning_rate": best_params["learning_rate"],
    "classifier__max_leaf_nodes": best_params["max_leaf_nodes"],
}
model.set_params(**tuned_settings)
model.fit(data_train, target_train)

# Score the refitted pipeline on the held-out test split.
test_score = model.score(data_test, target_test)
print(f"Test accuracy with best parameters: {test_score:.3f}")

Test accuracy with best parameters: 0.829
