In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_covtype
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the covertype data: features as a DataFrame (X) and the target (y).
X, y = fetch_covtype(
    return_X_y=True,
    as_frame=True,
)

In [3]:
# Hold out 20% of the data as a test set (80-20 train/test split).
# The split is stratified on y, and random_state is fixed so that the same
# partition is produced on every run (needed for Restart & Run All to
# reproduce the reported scores).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

In [4]:
# Column layout used by the later cells: the first ten columns of X are
# treated as numerical variables and all remaining columns as categorical.
num_vars, cat_vars = X.columns[:10], X.columns[10:]

In [5]:
# Preprocessing: standard-scale the numerical columns only; every other
# column is passed through unchanged.
ct = ColumnTransformer(
    transformers=[("num_transforms", StandardScaler(), num_vars)],
    remainder="passthrough",
)

# Classifier: HistGradientBoostingClassifier configured with macro-F1 as its
# own `scoring` setting and a boolean mask marking the categorical columns.
# NOTE(review): the mask is built from X's original column order; this assumes
# the transformer output keeps the numerical columns first -- confirm.
clf = HistGradientBoostingClassifier(
    scoring="f1_macro",
    categorical_features=X.columns.isin(cat_vars),
)

# Chain preprocessing and classifier into a single estimator.
pipe = Pipeline(steps=[("ct", ct), ("clf", clf)])

In [6]:
def run_experiment(n_rounds=5, random_state=0):
    """Repeat cross-validation of ``pipe`` on the training split.

    Each round scores a fresh clone of the module-level ``pipe`` on
    ``X_train`` / ``y_train`` with ``cross_val_score`` and the ``"f1_macro"``
    metric. No ``cv`` argument is passed, so every round uses
    ``cross_val_score``'s default splitter. The classifier step is seeded
    explicitly per round so that the whole experiment is repeatable while the
    rounds still differ from one another.

    Parameters
    ----------
    n_rounds : int, default=5
        Number of cross-validation rounds to run.
    random_state : int or None, default=0
        Base seed. Round ``i`` fits the ``"clf"`` step of the pipeline with
        ``random_state + i``. Pass ``None`` to leave the classifier unseeded
        (the previous, non-reproducible behaviour).

    Returns
    -------
    cv_scores : list of numpy.ndarray
        The array of fold scores returned by ``cross_val_score``, one per round.
    cv_averages : list of float
        Mean of the fold scores for each round.
    """
    cv_scores = []
    cv_averages = []
    for round_idx in range(n_rounds):
        seed = None if random_state is None else random_state + round_idx
        # Work on a clone so the shared ``pipe`` keeps its original parameters.
        estimator = clone(pipe).set_params(clf__random_state=seed)
        scores = cross_val_score(
            estimator=estimator, X=X_train, y=y_train, scoring="f1_macro")
        cv_scores.append(scores)
        cv_averages.append(scores.mean())
    return cv_scores, cv_averages

In [7]:
# Run the repeated cross-validation experiment and keep both results:
# the per-round arrays of fold scores and the per-round mean scores.
cv_scores, cv_averages = run_experiment()

In [8]:
# Report the raw fold scores, the per-round means, and the mean across rounds.
print(f"{cv_scores} \n")
print(f"{cv_averages} \n")
print(
    "Average score of 5 rounds of cv training: ",
    pd.Series(cv_averages).mean(),
)

[array([0.79252341, 0.76266617, 0.77632387, 0.80449326, 0.78665425]), array([0.79931958, 0.79024963, 0.81590209, 0.76501809, 0.80516795]), array([0.80715163, 0.79744009, 0.76939637, 0.79065708, 0.70418191]), array([0.79353929, 0.775817  , 0.7890802 , 0.80506303, 0.76791821]), array([0.77130779, 0.72049248, 0.7618449 , 0.80157026, 0.78085121])] 

[0.7845321899106545, 0.7951314695949857, 0.7737654180435584, 0.7862835453658885, 0.7672133266200335] 

Average score of 5 rounds of cv training:  0.7813851899070242


In [1]:
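# On average, standard scaling the numerical features resulted in a decrease in f1
# score of 0.9 (NOTE: units unclear -- f1 lies in [0, 1], so this is presumably 0.9
# percentage points; confirm against the unscaled baseline). However, I still want to
# see how standard scaling affects dimension reduction.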