In [2]:
# Train the chosen final model on the full training set and evaluate how well it
# scores on the held-out test set.

# I should have pickled the best model parameters and loaded them here, but I do not
# want to run the grid search again on my computer, so I am just going to recreate
# the model by hand.

In [5]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_covtype
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, multilabel_confusion_matrix

In [6]:
# Load the covertype data as pandas objects: X holds the features, y the labels.
X, y = fetch_covtype(as_frame=True, return_X_y=True)

In [7]:
# Hold out 20% of the rows as the final test set, stratified on y so every class
# keeps the same share in both splits. The fixed random_state makes the split
# (and therefore every score computed from it) reproducible across kernel
# restarts. The outputs recorded in this notebook came from an unseeded run, so
# expect slightly different numbers after re-running.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [8]:
# Split the feature names by position: the leading 10 columns are routed through
# the numeric pipeline, everything after them is treated as categorical.
num_vars, cat_vars = X.columns[:10], X.columns[10:]

In [9]:
# Numeric branch: a one-step pipeline that projects the numeric columns onto
# 8 principal components.
# NOTE(review): StandardScaler is imported but never applied before PCA, so the
# components follow the raw feature scales -- confirm this matches the
# grid-searched model being recreated.
num_pipe = Pipeline(steps=[("DimReduction", PCA(n_components=8))])

In [10]:
# Column layout produced by `ct`: the outputs of the numeric pipeline (the PCA
# components) come first, followed by the untouched passthrough columns.
ct = ColumnTransformer(
    [("num_pipe", num_pipe, num_vars)],
    remainder="passthrough",
)

# Build the categorical mask from the pipeline and the column split instead of
# hard-coding 8 and 44, so it cannot drift out of sync with either of them.
n_pca_components = num_pipe.named_steps["DimReduction"].n_components
categorical_mask = np.concatenate([
    np.zeros(n_pca_components, dtype=bool),  # PCA outputs are continuous
    np.ones(len(cat_vars), dtype=bool),      # passthrough columns are categorical
])

clf = HistGradientBoostingClassifier(
    categorical_features=categorical_mask,
    # Per the scikit-learn docs, `scoring` is the metric used for early stopping.
    scoring="f1_macro",
    # Seed the estimator's internal randomness (binning subsample and the
    # early-stopping validation split) so repeated fits give the same model.
    random_state=42,
)

# Full model: preprocessing followed by the gradient-boosting classifier.
pipe = Pipeline([("ct", ct), ("clf", clf)])

In [11]:
# Display the assembled pipeline to check its structure before fitting.
pipe

In [12]:
# Fit the preprocessing steps and the classifier on the training split only.
pipe.fit(X=X_train, y=y_train)

In [13]:
# Predict labels for the held-out test rows with the fitted pipeline.
y_pred = pipe.predict(X=X_test)

In [14]:
# Macro-averaged F1 on the test set: the unweighted mean of the per-class F1 scores.
f1_score(y_true=y_test, y_pred=y_pred, average="macro")

0.8168529039841179

In [15]:
# Overall fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8564064611068561

In [18]:
# On the test set the model achieves a macro-averaged F1 score of about 81.7% and an
# accuracy of about 85.6%.

In [21]:
# One 2x2 one-vs-rest matrix per class, laid out as [[TN, FP], [FN, TP]].
multilabel_confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[[ 67968,   5867],
        [  7757,  34611]],

       [[ 50923,   8619],
        [  6292,  50369]],

       [[108139,    913],
        [   626,   6525]],

       [[115446,    208],
        [   155,    394]],

       [[114033,    271],
        [   727,   1172]],

       [[112190,    540],
        [   694,   2779]],

       [[111833,    268],
        [   435,   3667]]])

In [22]:
# Some classes are predicted more accurately than others, but the numbers of type 1
# (false positive) and type 2 (false negative) errors for each class are at least
# relatively similar, which is the goal of optimizing for F1.