In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the Covertype dataset as pandas objects: X holds the feature columns,
# y the class labels.
X, y = fetch_covtype(return_X_y=True, as_frame=True)

In [3]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# partitions keep the class proportions, and random_state pins the shuffle so
# a Restart & Run All reproduces exactly the same partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

In [4]:
# Column labels grouped by feature type. The grouping is purely positional:
# the first ten columns are treated as numeric, every column after them as
# categorical.
num_vars, cat_vars = X.columns[:10], X.columns[10:]

In [5]:
# Pre-processing: scale only the numeric columns and pass every other column
# through untouched. StandardScaler is just the default occupant of the
# "Scaler" slot; the grid search replaces it with each candidate scaler.
ct = ColumnTransformer(
    transformers=[("Scaler", StandardScaler(), num_vars)],
    remainder="passthrough",
)

# Gradient-boosted classifier. `scoring` here is the metric the estimator
# itself uses for early stopping, not the metric of the grid search.
# NOTE: the boolean mask is computed against X's column order; it lines up
# with the transformer output only because the numeric columns come first in X.
clf = HistGradientBoostingClassifier(
    scoring="f1_macro",
    categorical_features=X.columns.isin(cat_vars),
)

# Chain pre-processing and classifier so both are fitted as one estimator.
pipe = Pipeline(steps=[("ct", ct), ("clf", clf)])

In [6]:
# Candidate scalers for the pipeline's "ct" -> "Scaler" slot; the grid search
# evaluates the pipeline with each of them in turn.
param_grid = dict(ct__Scaler=[StandardScaler(), MinMaxScaler()])

In [12]:
def run_experiment(n_trials=5):
    """Repeat the scaler grid search several times and collect the mean CV scores.

    Each trial fits a fresh ``GridSearchCV`` (scored with macro-F1) over the
    notebook-level ``pipe`` and ``param_grid`` on ``X_train`` / ``y_train``.

    Parameters
    ----------
    n_trials : int, default=5
        Number of grid searches to run. Must be at least 1.

    Returns
    -------
    mean_scores : list
        One entry per trial: that trial's ``cv_results_["mean_test_score"]``,
        i.e. one mean score per ``param_grid`` candidate.
    results : dict
        The full ``cv_results_`` of the *last* trial only; earlier trials
        contribute nothing but their mean scores.

    Raises
    ------
    ValueError
        If ``n_trials`` is smaller than 1 (there would be no results to return).
    """
    if n_trials < 1:
        raise ValueError(f"n_trials must be at least 1, got {n_trials}")

    mean_scores = []
    results = None
    for _ in range(n_trials):
        search = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring="f1_macro")
        search.fit(X_train, y_train)
        # Keep the whole results dict around so the final trial's can be returned.
        results = search.cv_results_
        mean_scores.append(results["mean_test_score"])
    return mean_scores, results

In [13]:
# Run the repeated grid search (slow: every trial refits the pipeline for each
# candidate scaler). `mean_scores` receives one entry of mean CV scores per
# trial; `results` is the cv_results_ of the final trial only.
mean_scores, results = run_experiment()

In [27]:
# One row per trial, one column per candidate scaler. The column labels are
# derived from param_grid (class name of each candidate, in grid order) so
# they cannot drift out of step with the scores if the grid is edited.
mean_scores = pd.DataFrame(
    mean_scores,
    columns=[type(scaler).__name__ for scaler in param_grid["ct__Scaler"]],
)

In [32]:
# Mean cross-validated macro-F1 per trial (rows) for each scaler (columns).
mean_scores

Unnamed: 0,StandardScaler,MinMaxScaler
0,0.797355,0.789224
1,0.783729,0.791003
2,0.793423,0.803929
3,0.801027,0.795642
4,0.78734,0.791769


In [29]:
# Average each scaler's score over all trials (column-wise mean).
mean_scores.mean(axis=0)

StandardScaler    0.792575
MinMaxScaler      0.794313
dtype: float64

In [33]:
# An experiment with five trials of GridSearchCV shows that MinMaxScaler scores a tiny bit
# higher than StandardScaler on average (mean macro-F1 of 0.7943 vs 0.7926, a gap smaller
# than the trial-to-trial spread), and both are a tiny bit more accurate than no scaling
# at all.  However, I will have to test both techniques in combination with feature
# reduction techniques to truly see which is better.