In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score

import mlflow
from mlflow import MlflowClient

In [2]:
# Load the cover-type data as a (features, target) pair of pandas objects.
X, y = fetch_covtype(return_X_y=True, as_frame=True)

In [3]:
# The target is highly imbalanced: the per-class counts below show two
# classes (1 and 2) accounting for the vast majority of the samples.
y.value_counts()

2    283301
1    211840
3     35754
7     20510
6     17367
5      9493
4      2747
Name: Cover_Type, dtype: int64

In [4]:
# Stratified 80/20 train/test split: stratifying on y keeps the class
# proportions of the imbalanced target the same in both subsets.
# A fixed random_state makes the split (and everything computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

In [5]:
# Sanity check: count missing values per training column, then total them.
# The total is expected to be 0 (no missing values in the train set).
missing_per_column = X_train.isna().sum()
missing_per_column.sum()

0

In [6]:
# Column labels of the numerical and categorical variables:
# the first 10 columns are numerical, every remaining column is categorical.
n_numeric = 10
num_vars, cat_vars = X.columns[:n_numeric], X.columns[n_numeric:]

In [7]:
# No preprocessing is applied: the hist gradient boosting classifier is
# used directly, with the categorical columns declared through a boolean
# mask over X's columns and macro F1 as its scoring metric.
is_categorical = X.columns.isin(cat_vars)
clf = HistGradientBoostingClassifier(
    scoring="f1_macro",
    categorical_features=is_categorical,
)

In [8]:

def run_experiment(n_rounds=5, cv=None):
    """Repeat cross-validation of the notebook-level ``clf`` on the train set.

    Each round calls ``cross_val_score`` on ``X_train`` / ``y_train`` with
    macro F1 scoring and records both the per-fold scores and their mean.

    NOTE(review): with ``cv=None`` scikit-learn's default splitter is used;
    it appears to produce the same (unshuffled) folds on every round, so
    round-to-round differences would come only from the estimator's own
    randomness. Confirm this is intended, or pass a shuffled splitter
    through ``cv``.

    Parameters
    ----------
    n_rounds : int, default=5
        Number of times the cross-validation is repeated.
    cv : optional, default=None
        Forwarded unchanged to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    cv_scores : list
        One entry per round: the fold scores returned by ``cross_val_score``.
    cv_averages : list
        One entry per round: the mean of that round's fold scores.
    """
    cv_scores = []
    cv_averages = []
    for _ in range(n_rounds):
        scores = cross_val_score(
            estimator=clf, X=X_train, y=y_train, scoring="f1_macro", cv=cv)
        cv_scores.append(scores)
        cv_averages.append(scores.mean())
    return cv_scores, cv_averages

In [9]:
# Run the repeated cross-validation, keeping both the per-fold scores of
# every round and the mean score of each round.
cv_scores, cv_averages = run_experiment()

In [10]:

# Print the per-fold scores, then the per-round means, each followed by a
# blank line, and finally the mean of the per-round means.
for round_results in (cv_scores, cv_averages):
    print(round_results, "\n")

overall_mean = pd.Series(cv_averages).mean()
print("Average score of 5 rounds of cv training: ", overall_mean)

[array([0.78188437, 0.80132189, 0.76507944, 0.77617789, 0.79542934]), array([0.80110331, 0.78870969, 0.79602936, 0.80032378, 0.79809197]), array([0.79716197, 0.72159867, 0.80388484, 0.78116982, 0.79176789]), array([0.78644979, 0.7867932 , 0.80638133, 0.79379137, 0.80154862]), array([0.77333496, 0.79509077, 0.80085506, 0.80531761, 0.8022977 ])] 

[0.7839785860179983, 0.7968516214772802, 0.7791166374545603, 0.7949928603866416, 0.7953792211966233] 

Average score of 5 rounds of cv training:  0.7900637853066208
