# 09 XGB Hype Study
Performing a hyperparameter study on XGBoost to understand which hyperparameters have the biggest impact on performance. This notebook runs a whole lot of runs, which can then be analyzed on a parameter by parameter basis.

env: Data Science 3.0

instance: ml.t3.medium

# Installing the most up-to-date experiment tracker client and its dependencies

In [2]:
!pip install git+https://github.com/DanielWarfield1/TabularExperimentTrackerClient

Collecting git+https://github.com/DanielWarfield1/TabularExperimentTrackerClient
  Cloning https://github.com/DanielWarfield1/TabularExperimentTrackerClient to /tmp/pip-req-build-gafk3em_
  Running command git clone --filter=blob:none --quiet https://github.com/DanielWarfield1/TabularExperimentTrackerClient /tmp/pip-req-build-gafk3em_
  Resolved https://github.com/DanielWarfield1/TabularExperimentTrackerClient to commit 5e738443d55b9637454776bd740a8083af70272d
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting openml (from TabularExperimentTrackerClient==0.0.1)
  Using cached openml-0.14.1-py3-none-any.whl
Collecting liac-arff>=2.4.0 (from openml->TabularExperimentTrackerClient==0.0.1)
  Using cached liac_arff-2.5.0-py3-none-any.whl
Collecting xmltodict (from openml->TabularExperimentTrackerClient==0.0.1)
  Using cached xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Collecting minio (from openml->TabularExperimentTrackerClient==0.0.1)
  Obtaining dependency information fo

In [3]:
#grabbing credentials from local credential files (one API key per file)
# NOTE(review): .read() keeps any trailing newline stored in the file —
# confirm the client tolerates it, or strip the keys.
with open ("/root/Credentials/openMLAPIKey.txt", "r") as myfile:
    opml_key = myfile.read()
with open ("/root/Credentials/tabExpTrackAPIKey.txt", "r") as myfile:
    orch_key = myfile.read()

In [4]:
#configuring the experiment tracker
from TabularExperimentTrackerClient.ExperimentClient import ExperimentClient

#creating the experiment client with verbose output disabled
ex = ExperimentClient(verbose = False)
#registering the OpenML API key that was read from the credentials file
ex.define_opml_cred(opml_key)
#registering the orchestration credentials: 'test1' together with the tracker API key
# NOTE(review): 'test1' looks like an account/user name — confirm
ex.define_orch_cred('test1', orch_key)

In [8]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/c1/cf/a662bc8f40588d54663edfe12980946670490bff0b6e793c7896a4fe36df/xgboost-2.0.0-py3-none-manylinux2014_x86_64.whl.metadata
  Downloading xgboost-2.0.0-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.0-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.0
[0m

# Defining Hyperparameter Spaces
using hyperparameter space defined in https://arxiv.org/pdf/2207.08815.pdf

For convenience, every hyperparameter space uses the parameter names of the module it will be passed to.

In [9]:
import copy
import numpy as np

#============================================================
# XGBoost
#============================================================

# Search space for XGBoost; keys are the estimator's own keyword arguments,
# see https://xgboost.readthedocs.io/en/stable/parameter.html
# Each entry names a sampling distribution and its inclusive bounds.
XGBoost_space = {
    "max_depth": dict(distribution="int_uniform", min=1, max=11),
    "n_estimators": dict(distribution="int_uniform", min=100, max=200),
    "min_child_weight": dict(distribution="log_uniform", min=1, max=1e2),
    "subsample": dict(distribution="float_uniform", min=0.5, max=1),
    "colsample_bylevel": dict(distribution="float_uniform", min=0.5, max=1),
    "colsample_bytree": dict(distribution="float_uniform", min=0.5, max=1),
    "gamma": dict(distribution="log_uniform", min=1e-8, max=7),
    "reg_lambda": dict(distribution="log_uniform", min=1, max=4),
    "reg_alpha": dict(distribution="log_uniform", min=1e-8, max=1e2),
}


# The classifier and the regressor share the very same space object.
spaces = dict(
    XGBoostClass_space=XGBoost_space,
    XGBoostReg_space=XGBoost_space,
)

# Visualizing Hyperparameter Spaces
There are a lot of hyperparameter spaces, which leaves a lot of room for mistakes. Sampling each space lets me, at a glance, visualize/read example hyperparameter points within it.

this is mostly to make sure I didn't accidentally make a typo.

In [10]:
import pandas as pd
import matplotlib.pyplot as plt

def vis(df, title, print_only=True):
    """Preview sampled hyperparameter points and optionally plot them.

    Prints a banner containing ``title`` followed by ``df.head()``. When
    ``print_only`` is False, additionally draws one 50-bin histogram per
    column of ``df`` (every column, stacked vertically) and shows the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sampled point, one column per hyperparameter.
    title : str
        Used in the printed banner and as the figure title.
    print_only : bool, default True
        When True, skip plotting and return right after printing.

    Returns
    -------
    None
    """
    print('========== {} =========='.format(title))
    print(df.head())

    if print_only:
        return

    columns = list(df.columns)
    if not columns:
        # Nothing to plot; subplots() cannot build a zero-row grid.
        return

    # squeeze=False always yields a 2-D grid of axes, so a single-column
    # frame is handled the same way as a multi-column one.
    fig, axs = plt.subplots(len(columns), 1, squeeze=False)
    for col, ax in zip(columns, axs[:, 0]):
        ax.hist(df[col], bins=50)
        ax.set_ylabel(col)

    fig.set_size_inches(18.5, 20)
    fig.suptitle(title)
    plt.show()

#monte carlo sampled hyperparameter points, keyed by space name
space_samples = {}

#sample every registered space and print a preview of the drawn points
for space_name, space_def in spaces.items():
    sampled_points = pd.DataFrame(ex.monte_carlo_sample_space(space_def))
    space_samples[space_name] = sampled_points
    vis(sampled_points, space_name)

   max_depth  n_estimators  min_child_weight  subsample  colsample_bylevel  \
0          8           189         45.354477   0.766593           0.978090   
1          6           191          3.042358   0.866723           0.518982   
2         11           169          6.795378   0.514475           0.596082   
3          6           155         31.290588   0.717686           0.743288   
4         10           171          4.006535   0.979880           0.690904   

   colsample_bytree         gamma  reg_lambda  reg_alpha  
0          0.601803  7.759480e-01    1.678293  26.532263  
1          0.670071  5.962196e+00    2.773802   0.000899  
2          0.640992  5.475179e-03    1.107315   0.578574  
3          0.542234  1.125327e-04    2.760480   0.003530  
4          0.762981  2.944938e-07    1.727320   0.004412  
   max_depth  n_estimators  min_child_weight  subsample  colsample_bylevel  \
0          7           192          9.439790   0.906720           0.946163   
1          4         

# Defining model initialization
now that I have random hyperparameter points, I'm defining functions which turn those hyperparameter points into actual initialized models. I can then iterate over the sampled hyperparameter points to ensure that model initialization is working correctly.

In [12]:
"""Defining XGBoost initializer function
"""

from xgboost import XGBRegressor, XGBClassifier

def init_XGB(hype, isClass):
    """Build an unfitted XGBoost estimator from one hyperparameter point.

    ``hype`` is a mapping of keyword arguments forwarded verbatim to the
    estimator constructor. A truthy ``isClass`` selects ``XGBClassifier``;
    otherwise ``XGBRegressor`` is used.
    """
    estimator_cls = XGBClassifier if isClass else XGBRegressor
    return estimator_cls(**hype)

#smoke test: every sampled hyperparameter point must construct without raising
for class_hype in space_samples['XGBoostClass_space'].to_dict('records'):
    init_XGB(class_hype, True)
for reg_hype in space_samples['XGBoostReg_space'].to_dict('records'):
    init_XGB(reg_hype, False)
print('XGBoost defined successfully')

XGBoost defined successfully


# Defining Model Groups
A "model_group" associates models with hyperparameter spaces. The key to this dictionary is what is used in model-task applications, the model is a string which tells the orchestrator what model to build, and the hype is the hyperparameter space.

In [13]:
#each group is keyed by its model name, carries that same name as the model
#identifier, and uses the shared XGBoost search space
model_groups = {
    model_name: {'model': model_name, 'hype': XGBoost_space}
    for model_name in ('XGBoost_class', 'XGBoost_reg')
}

ex.def_model_groups(model_groups)

# Defining Data Groups
Using an automated function to define the data groups: purely numeric, and mixed numeric/categorical features, for both classification and regression tasks.

In [14]:
#let the client define its data groups, then list the group names it created
ex.def_data_groups_opml()
print('automatically defined data groups:', ex.data_groups.keys(), sep='\n')

automatically defined data groups:
dict_keys(['opml_reg_purnum_group', 'opml_class_purnum_group', 'opml_reg_numcat_group', 'opml_class_numcat_group'])


# Defining Applications

In [15]:
#partitioning the model groups by the task-type marker in their key
classification_models = [name for name in model_groups if '_class' in name]
regression_models = [name for name in model_groups if '_reg' in name]

#regression models go to the regression data groups and classification models
#to the classification ones. purnum = pure numeric, numcat = numeric/categorical
applications = dict(
    opml_reg_purnum_group=regression_models,
    opml_reg_numcat_group=regression_models,
    opml_class_purnum_group=classification_models,
    opml_class_numcat_group=classification_models,
)

ex.def_applications(applications)

# Defining Dataset Preprocessing
Categorical features get one-hot encoded, the dataset is split into train/validation/test partitions (50%/25%/25%), and each partition is returned together with its targets. Classification targets are label encoded.

Considerations:
 - OneHot might not be the best for all use cases
 - high cardinality categorical features should probably be filtered
 - some models (like catboost) need raw categorical features
 - should probably do some type of balancing/normalizing

for now it should be fine though.

In [16]:
!pip install git+https://github.com/Bartosz-G/NeuralNetworksTrainingPackage

Collecting git+https://github.com/Bartosz-G/NeuralNetworksTrainingPackage
  Cloning https://github.com/Bartosz-G/NeuralNetworksTrainingPackage to /tmp/pip-req-build-un7as8y_
  Running command git clone --filter=blob:none --quiet https://github.com/Bartosz-G/NeuralNetworksTrainingPackage /tmp/pip-req-build-un7as8y_
  Resolved https://github.com/Bartosz-G/NeuralNetworksTrainingPackage to commit 2eff78e8c87ac0bc780226edc9f1b16ac8629ca6
  Preparing metadata (setup.py) ... [?25ldone
Collecting torch (from NeuralNetworksTrainingPackage==1.0.0)
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/6d/13/b5e8bacd980b2195f8a1741ce11cbb9146568607795d5e4ff510dcff1064/torch-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata
  Downloading torch-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torcheval (from NeuralNetworksTrainingPackage==1.0.0)
  Obtaining dependency information for torcheval from https://files.pythonhosted.org/packages/e4/de/e

In [17]:

from NeuralNetworksTrainingPackage.event_handler import dataPreProcessingEventEmitter
from NeuralNetworksTrainingPackage.dataprocessing.basic_pre_processing import filterCardinality, quantileTransform, truncateData ,balancedTruncateData, oneHotEncodePredictors, oneHotEncodeTargets, toDataFrame, splitTrainValTest, balancedSplitTrainValTest, labelEncodeTargets

# ---- tunable settings -------------------------------------------------
n_sample = 20000                            # passed as n= to both truncation steps
split = [0.5, 0.25, 0.25]                   # presumably train/val/test fractions — confirm in the package
quantile_transform_distribution='normal'    # output_distribution of the quantile transform

# Models listed here skip the generic task pipeline in the run loop and only
# get the pipeline registered under their own name.
exclude_models = ('CatBoost_reg', 'CatBoost_class')


data_pre_processing = dataPreProcessingEventEmitter()

# ---- individual transformation steps ----------------------------------
filter_cardinality = filterCardinality(transform = 'all')
truncate_data = truncateData(n = n_sample, transform = 'all')
balanced_truncate_data = balancedTruncateData(n = n_sample, transform = 'all') # Ensures balance of classes
one_hot_encode_predictors = oneHotEncodePredictors(transform = 'all')
one_hot_encode_targets = oneHotEncodeTargets(transform = 'all')   # built but not registered in any pipeline in this cell
label_encode_targets = labelEncodeTargets(transform = 'all')
to_data_frame = toDataFrame(transform = 'all')                    # built but not registered in any pipeline in this cell
split_train_val_test = splitTrainValTest(split = split)
balanced_split_train_val_test = balancedSplitTrainValTest(split = split)
quantile_transform = quantileTransform(output_distribution = quantile_transform_distribution, transform = 'all')


# ---- pipelines ----------------------------------------------------------
# Transformations will be called in the order they're added to
# data_pre_processing, so the order of each tuple below is significant.
regression_steps = (
    filter_cardinality,
    truncate_data,
    one_hot_encode_predictors,
    split_train_val_test,
    quantile_transform,
)
for step in regression_steps:
    data_pre_processing.add_pre_processing_step('regression', step)

# Classification uses the class-balanced truncation/split variants and label
# encodes the targets (one-hot target encoding is deliberately left out).
classification_steps = (
    filter_cardinality,
    balanced_truncate_data,
    one_hot_encode_predictors,
    label_encode_targets,
    balanced_split_train_val_test,
    quantile_transform,
)
for step in classification_steps:
    data_pre_processing.add_pre_processing_step('classification', step)


# Required to distinguish between classification and regression tasks:
# every task in either regression data group counts as a regression task.
regression_tasks = ex.data_groups['opml_reg_purnum_group'] + ex.data_groups['opml_reg_numcat_group']
# ======================== Added: Bart ==================================

In [18]:
# ======================== Added: Bart ==================================
# CatBoost pipelines are registered under the model names themselves. They use
# the same stages as the generic 'regression'/'classification' pipelines except
# that predictors are not one-hot encoded. Steps run in registration order.
catboost_reg_steps = [
    filter_cardinality,
    truncate_data,
    split_train_val_test,
    quantile_transform,
]
for catboost_step in catboost_reg_steps:
    data_pre_processing.add_pre_processing_step('CatBoost_reg', catboost_step)

catboost_class_steps = [
    filter_cardinality,
    balanced_truncate_data,
    label_encode_targets,
    balanced_split_train_val_test,
    quantile_transform,
]
for catboost_step in catboost_class_steps:
    data_pre_processing.add_pre_processing_step('CatBoost_class', catboost_step)
# ======================== Added: Bart ==================================

# Creating Experiment
using the model groups, data groups, and applications to register a new experiment.

In [19]:
# Set before registering the experiment.
# NOTE(review): presumably the number of runs requested per model/task pair — confirm in the client.
ex.runs_per_pair = 240
# Registers the experiment under this name; the recorded cell output was 'new experiment created'.
ex.reg_experiment('09_XGBHypeStudy')

'new experiment created'

# Metric Calculation
TODO this is getting a bit hacky, and I'm starting to get hurt by ambiguous types

In [20]:
#classification metrics
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, roc_auc_score
#regression metrics
from sklearn.metrics import r2_score, mean_squared_error

# Quantile levels at which the per-sample squared error is summarised.
_SE_QUANTILES = [0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.975, 0.99]


def _as_class_labels(values):
    """Collapse 2-D one-hot / per-class rows to 1-D labels; pass 1-D input through."""
    if np.ndim(values) < 2:
        return values
    arr = np.asarray(values)
    # A single column already holds the labels; argmax would turn it into zeros.
    return arr[:, 0] if arr.shape[1] == 1 else np.argmax(arr, axis=1)


def _first_column(values):
    """Return ``values`` as a 1-D float array, keeping only column 0 of 2-D input."""
    arr = np.asarray(values, dtype=float)
    return arr[:, 0] if arr.ndim > 1 else arr


def calc_metrics(y, yhat, is_categorical, yhat_proba=None):
    """Compute plain-Python metrics for one data split.

    Parameters
    ----------
    y : array-like
        Ground-truth targets (labels or one-hot rows for classification).
    yhat : array-like
        Predictions, in the same form as ``y`` or as 1-D labels/values.
    is_categorical : bool
        True selects the classification metrics, False the regression ones.
    yhat_proba : array-like, optional
        Scores handed to ``roc_auc_score``; the metric is skipped when None.

    Returns
    -------
    dict
        Classification: ``accuracy_score``, ``confusion_matrix`` (nested lists
        of floats) and, if ``yhat_proba`` is given, ``roc_auc_score``.
        Regression: ``r2_score``, ``RMSE`` and ``se_quant`` (quantile level ->
        squared-error quantile). All numeric values are Python floats.
    """
    metrics = {}
    if is_categorical:
        # Collapse to labels BEFORE scoring so that a one-hot target can be
        # compared against label predictions (and vice versa).
        y = _as_class_labels(y)
        yhat = _as_class_labels(yhat)
        metrics['accuracy_score'] = float(accuracy_score(y, yhat))
        metrics['confusion_matrix'] = [[float(v) for v in row] for row in confusion_matrix(y, yhat)]
        if yhat_proba is not None:
            # Cast like every other metric so the result is a plain float.
            metrics['roc_auc_score'] = float(roc_auc_score(y, yhat_proba))
    else:
        metrics['r2_score'] = float(r2_score(y, yhat))
        # Root of the per-output MSE, averaged over outputs. Avoids the
        # `squared=` keyword, which newer scikit-learn releases no longer accept.
        per_output_mse = mean_squared_error(y, yhat, multioutput='raw_values')
        metrics['RMSE'] = float(np.mean(np.sqrt(per_output_mse)))
        # Positional (index-free) squared error, so y may be an ndarray or a Series.
        squared_error = (_first_column(yhat) - _first_column(y)) ** 2
        metrics['se_quant'] = {
            level: float(value)
            for level, value in zip(_SE_QUANTILES, np.quantile(squared_error, _SE_QUANTILES))
        }

    return metrics

# Running Experiment

In [None]:
from torch.nn.modules.loss import L1Loss
#for suppressing output
import contextlib
import time

exp_info = ex.experiment_info()
successful_runs = exp_info['successful_runs']
required_runs = exp_info['required_runs']
print('total successful runs: {}'.format(successful_runs))
print('total required runs: {}'.format(required_runs))

#iterating over all folds
for i in range(successful_runs, required_runs):
    print('=============== {}/{} ================'.format(i, required_runs))

    run_info = ex.begin_run_sticky()
    # run_info = ex.begin_run()

    print('beginning new run with task {}'.format(run_info['task']))
    print('model: {}'.format(run_info['model']))
    print('Run id: {}'.format(ex.run_id))

    # ======================== Added: Bart ==================================
    seed = 42
    hyperparameters = run_info['hyp']
    model_name = run_info['model']

    np.random.seed(seed)

    print('loading task...')
    X, y, categorical_indicator, attribute_names = ex.opml_load_task(run_info['mtpair_task'])


    if run_info['task'] in regression_tasks:
        task = 'regression'
        is_categorical = False
    else:
        task = 'classification'
        is_categorical = True


    data_pre_processing.set_seed_for_all(seed)
    data_pre_processing.set_dataset(X, y, categorical_indicator, attribute_names)
    if model_name not in exclude_models:
      data_pre_processing.apply(task)
    data_pre_processing.apply(model_name)
    train_data, val_data, test_data = data_pre_processing.get_train_val_test()

    train_x, train_y, train_categorical_indicator, train_attribute_names = train_data
    validate_x, validate_y, validate_categorical_indicator, validate_attribute_names = val_data
    test_x, test_y, test_categorical_indicator, test_attribute_names = test_data
    cat_idx = [i for i, is_categorical in enumerate(train_categorical_indicator) if is_categorical]
    print('task loaded...')

    print('is categorical target? : {}'.format(is_categorical))

    test_yhat = None
    start_time = time.time()

    #creating model, training, and predicting
    m=None
    match run_info['model']:
        case 'XGBoost_class' | 'XGBoost_reg':
            m = init_XGB(run_info['hyp'], is_categorical)
            m.fit(train_x.values, train_y.values)
            train_yhat = m.predict(train_x.values)
            val_yhat = m.predict(validate_x.values)
            test_yhat = m.predict(test_x.values)

    #trying predict_proba
    train_yhat_proba = None
    val_yhat_proba = None
    test_yhat_proba = None
    if is_categorical:
        try:
            train_yhat_proba = m.predict_proba(train_x.values)[:,1]
            val_yhat_proba = m.predict_proba(validate_x.values)[:,1]
            test_yhat_proba = m.predict_proba(test_x.values)[:,1]
        except Exception as e:
            print('probabalistic predictoin failed')
            print('==========')
            print(e)
            print('==========')

    #calculating metrics"
    metrics = {}
    train_metrics = calc_metrics(train_y, train_yhat, is_categorical, train_yhat_proba)
    validate_metrics = calc_metrics(validate_y, val_yhat, is_categorical, val_yhat_proba)
    test_metrics = calc_metrics(test_y, test_yhat, is_categorical, test_yhat_proba)
    metrics['train_metrics'] = train_metrics
    metrics['validate_metrics'] = validate_metrics
    metrics['test_metrics'] = test_metrics
    metrics['epoch_time'] = time.time()-start_time
    print('run results:')
    print(metrics)

    #logging metrics and ending run
    ex.update_run(metrics)
    ex.end_run()

total successful runs: 0
total required runs: 14160
beginning new run with task 336-361086
model: XGBoost_reg
Run id: 6524a0607be46fcabbb0211f
loading task...
task loaded...
is categorical target? : False
run results:
{'train_metrics': {'r2_score': 0.9749039264823616, 'RMSE': 0.08965187337930902, 'se_quant': {0.01: 6.366902542974548e-07, 0.025: 3.965794916052105e-06, 0.05: 1.8471785515990242e-05, 0.1: 7.83074300362918e-05, 0.2: 0.00029702834374801114, 0.3: 0.0006957717563422257, 0.4: 0.001252524882350442, 0.5: 0.0020287295144332104, 0.6: 0.0031705265542628586, 0.7: 0.00477020657540354, 0.8: 0.0076914052999633685, 0.9: 0.014975561569729208, 0.95: 0.02686904627571101, 0.975: 0.04400427586373734, 0.99: 0.09574863428293016}}, 'validate_metrics': {'r2_score': 0.9755848264918484, 'RMSE': 0.08834501135080636, 'se_quant': {0.01: 6.688559763667405e-07, 0.025: 3.6219630276695567e-06, 0.05: 1.5598427881090275e-05, 0.1: 6.076212446491443e-05, 0.2: 0.00025023455936551214, 0.3: 0.0005772996025455745