# HyperbandSearchCV for hyperparameter tuning

In [None]:
import numpy as np
import pandas as pd
import os
import xgboost as xgb
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score, roc_curve
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV
import matplotlib as mpl
import matplotlib.pyplot as plt
from copy import deepcopy
import time
from sklearn.multioutput import MultiOutputClassifier

# Set matplotlib's default figure resolution to 200 DPI.
mpl.rcParams['figure.dpi'] = 200

In [None]:
# XGBoost classifier with the binary logistic objective, a fixed seed, and
# the "gpu_hist" tree method.
clf_xgb = xgb.XGBClassifier(objective="binary:logistic", random_state=42, tree_method = "gpu_hist")

In [None]:
# Load a previously saved model file into the classifier created above.
clf_xgb.load_model("/kaggle/input/xgbdata/xgboost-full-2905-5pm.json")

In [None]:
# Target array; it is indexed as a 2-D array when the train/test subsets are
# built further down.
Y = np.load("/kaggle/input/xgbdata/Y_1499.npy")

In [None]:
# presumably the label names matching the columns of Y — TODO confirm
labels_to_consider = np.load("/kaggle/input/xgbdata/Y_1499_labels.npy")

In [None]:
# Path of the .npy file that is loaded as the feature matrix X.
fn = '/kaggle/input/t5embeds/train_embeds.npy'

In [None]:
# Load the feature matrix from `fn` and show its path, shape and contents.
print(fn)
X = np.load(fn)
print(X.shape)
X

In [None]:
# Previously saved prediction probabilities (not used by the visible cells).
Y_pred_probabilities = np.load('/kaggle/input/cafa-5-submission/xgb-prediction-full-probabilities-06-06.npy')

In [None]:
# Split row indices (rather than the data itself) into train and test sets.
IX = np.arange(len(X))
print(IX.shape)
print(IX)
# train_size=0.1 puts 10% of the indices in IX_train and the rest in IX_test.
# NOTE(review): confirm the 10/90 split is intended.
IX_train, IX_test, _,_ = train_test_split( IX, IX, train_size=0.1, random_state=42)

In [None]:
# Keep at most the first 10000 train indices and the first 10000 test indices
# and pull the matching rows out of X and Y.
X_train, y_train = X[IX_train[:10000], :], Y[IX_train[:10000], :]
X_test, y_test = X[IX_test[:10000], :], Y[IX_test[:10000], :]

## Hyperband Search CV implementation
#### Code adapted from the following repo - [https://github.com/thuijskens/scikit-hyperband](https://github.com/thuijskens/scikit-hyperband)

`HyperbandSearchCV` uses scikit-learn's `BaseSearchCV` as the base class.  
`HyperbandSearchCV2` uses scikit-learn's `GridSearchCV` as the base class.

In [None]:
import copy

import numpy as np
from scipy.stats import rankdata

from sklearn.utils import check_random_state
from sklearn.model_selection._search import BaseSearchCV, ParameterSampler


# Public name exported by the code in this cell.
__all__ = ['HyperbandSearchCV']


class HyperbandSearchCV(BaseSearchCV):
    """Hyperband hyper-parameter search built on ``BaseSearchCV``.

    Configurations are drawn with ``ParameterSampler`` and evaluated in
    brackets of successive halving: every round refits the surviving
    configurations with a larger value of ``resource_param`` and keeps the
    best ``1 / eta`` fraction of them.

    Parameters
    ----------
    estimator : estimator object
        Estimator to tune; it must expose ``resource_param`` as a parameter.

    param_distributions : dict
        Search space, passed unchanged to ``ParameterSampler``.

    resource_param : str, default='n_estimators'
        Estimator parameter that is overwritten with the per-round resource.

    eta : int, default=3
        Reduction factor between successive halving rounds; must be > 1.

    min_iter, max_iter : int
        Smallest and largest resource amount given to one configuration.

    skip_last : int, default=0
        Number of final successive halving rounds to skip in every bracket.

    random_state : int, RandomState instance or None, default=None
        Seed for the configuration sampler.

    scoring, n_jobs, refit, cv, verbose, pre_dispatch, error_score, \
return_train_score
        Forwarded unchanged to ``BaseSearchCV``.
    """

    def __init__(self, estimator, param_distributions,
                 resource_param='n_estimators', eta=3, min_iter=1,
                 max_iter=81, skip_last=0, scoring=None, n_jobs=1,
                 refit=True, cv=None,
                 verbose=0, pre_dispatch='2*n_jobs', random_state=None,
                 error_score='raise', return_train_score=False):
        self.param_distributions = param_distributions
        self.resource_param = resource_param
        self.eta = eta
        self.min_iter = min_iter
        self.max_iter = max_iter
        self.skip_last = skip_last
        self.random_state = random_state

        super().__init__(
            estimator=estimator, scoring=scoring, n_jobs=n_jobs,
            refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score,
            return_train_score=return_train_score)

    def _max_bracket_index(self):
        """Return the largest ``s`` with ``min_iter * eta**s <= max_iter``.

        Integer arithmetic is used instead of ``floor(log(...) / log(eta))``
        because the floating-point quotient can land just below an exact
        integer and lose a bracket.
        """
        s_max = 0
        budget = self.min_iter * self.eta
        while budget <= self.max_iter:
            s_max += 1
            budget *= self.eta
        return s_max

    def _top_configurations(self, results, n_candidates, n_to_keep):
        """Return the ``n_to_keep`` best of the last ``n_candidates`` results.

        ``evaluate_candidates`` returns the results of every candidate
        evaluated so far, so only the tail belonging to the current round is
        ranked; otherwise candidates from earlier rounds and brackets would
        compete for the surviving slots.
        """
        score_key = 'mean_test_score'
        if score_key not in results:
            # Multi-metric scoring: rank by the metric used for refitting.
            score_key = 'mean_test_%s' % self.refit

        scores = np.asarray(results[score_key][-n_candidates:], dtype=float)
        latest = results['params'][-n_candidates:]
        # Stable descending sort; NaN scores (failed fits) are ordered last.
        order = np.argsort(-scores, kind='stable')
        return [latest[j] for j in order[:n_to_keep]]

    def _run_search(self, evaluate_candidates):
        """Run every Hyperband bracket through ``evaluate_candidates``.

        Raises
        ------
        ValueError
            If a constructor argument is invalid or ``skip_last`` exceeds the
            number of successive halving rounds.
        """
        self._validate_input()

        s_max = self._max_bracket_index()
        if self.skip_last > s_max:
            raise ValueError('skip_last is higher than the total number of rounds')

        random_state = check_random_state(self.random_state)

        # One label per evaluated candidate, in evaluation order; ``fit``
        # stores it as the ``hyperband_bracket`` column of ``cv_results_``.
        self._bracket_labels = []

        for round_index, s in enumerate(reversed(range(s_max + 1))):
            # Number of configurations requested for this bracket.
            n = ((s_max + 1) // (s + 1)) * self.eta ** s

            configurations = list(ParameterSampler(param_distributions=self.param_distributions,
                                                   n_iter=n,
                                                   random_state=random_state))

            if self.verbose > 0:
                print('Starting bracket {0} (out of {1}) of hyperband'
                      .format(round_index + 1, s_max + 1))

            for i in range((s + 1) - self.skip_last):
                # r_i = max_iter * eta**(i - s), computed exactly.
                n_iterations = self.max_iter // self.eta ** (s - i)
                # floor(n_i / eta) with n_i = floor(n / eta**i).
                n_to_keep = n // self.eta ** (i + 1)

                if self.verbose > 0:
                    msg = ('Starting successive halving iteration {0} out of'
                           ' {1}. Fitting {2} configurations, with'
                           ' resource_param {3} set to {4}')

                    if n_to_keep > 0:
                        msg += ', and keeping the best {5} configurations.'

                    msg = msg.format(i + 1, s + 1, len(configurations),
                                     self.resource_param, n_iterations,
                                     n_to_keep)
                    print(msg)

                # Set the cost parameter for every configuration
                parameters = copy.deepcopy(configurations)
                for configuration in parameters:
                    configuration[self.resource_param] = n_iterations

                results = evaluate_candidates(parameters)
                self._bracket_labels.extend([round_index + 1] * len(parameters))

                if n_to_keep > 0:
                    configurations = self._top_configurations(
                        results, len(parameters), n_to_keep)

            if self.skip_last > 0 and self.verbose > 0:
                print('Skipping the last {0} successive halving iterations'
                      .format(self.skip_last))

    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        Returns
        -------
        self
            The fitted search object; ``cv_results_['hyperband_bracket']``
            holds the 1-based bracket number of every evaluated candidate.
        """
        super().fit(X=X, y=y, groups=groups, **fit_params)

        # Labels were recorded while candidates were evaluated, so the column
        # always has the same length as the other ``cv_results_`` columns.
        self.cv_results_['hyperband_bracket'] = np.asarray(
            self._bracket_labels, dtype=float)

        return self

    def _validate_input(self):
        """Check the Hyperband-specific constructor arguments.

        Raises
        ------
        ValueError
            If ``min_iter``, ``max_iter``, ``skip_last`` or ``eta`` is not a
            suitable integer, or the estimator lacks ``resource_param``.
        """
        if not isinstance(self.min_iter, int) or self.min_iter <= 0:
            raise ValueError('min_iter should be a positive integer, got %s' %
                             self.min_iter)

        if not isinstance(self.max_iter, int) or self.max_iter <= 0:
            raise ValueError('max_iter should be a positive integer, got %s' %
                             self.max_iter)

        if self.max_iter < self.min_iter:
            raise ValueError('max_iter should be bigger than min_iter, got '
                             'max_iter=%d and min_iter=%d' % (self.max_iter,
                                                              self.min_iter))

        if not isinstance(self.skip_last, int) or self.skip_last < 0:
            raise ValueError('skip_last should be a non-negative integer, '
                             'got %s' % self.skip_last)

        if not isinstance(self.eta, int) or not self.eta > 1:
            raise ValueError('eta should be an integer greater than 1, '
                             'got %s' % self.eta)

        if self.resource_param not in self.estimator.get_params().keys():
            raise ValueError('resource_param is set to %s, but base_estimator %s '
                             'does not have a parameter with that name' %
                             (self.resource_param,
                              self.estimator.__class__.__name__))


In [None]:
# Candidate values for each tuned hyper-parameter.  'n_estimators' is left out
# on purpose: it is the resource parameter, so the search overwrites it in
# every round and listing it here would only produce duplicate candidates.
param_grid = {'gamma': [0, 6.4, 25.6],
              'learning_rate': [0.03, 0.3, 1],
              'max_depth': [3, 6, 12],
              'reg_alpha': [0, 6.4, 25.6],
              'reg_lambda': [0, 6.4, 25.6]}

# Hyperband search (BaseSearchCV variant) scored by ROC AUC with 3-fold CV.
search0 = HyperbandSearchCV(estimator=clf_xgb,
                            param_distributions=param_grid,
                            resource_param='n_estimators',
                            scoring='roc_auc',
                            return_train_score=True,
                            verbose=1,
                            cv=3)
search0.fit(X_train, y_train)
print(search0.best_params_)

In [None]:
import copy

import numpy as np
from scipy.stats import rankdata

from sklearn.utils import check_random_state
from sklearn.model_selection._search import BaseSearchCV, ParameterSampler


# This cell defines HyperbandSearchCV2, so export it alongside the class
# defined earlier instead of repeating the earlier cell's list.
__all__ = ['HyperbandSearchCV', 'HyperbandSearchCV2']


class HyperbandSearchCV2(GridSearchCV):
    """Hyperband hyper-parameter search built on ``GridSearchCV``.

    Configurations are drawn with ``ParameterSampler`` and evaluated in
    brackets of successive halving: every round refits the surviving
    configurations with a larger value of ``resource_param`` and keeps the
    best ``1 / eta`` fraction of them.  ``param_distributions`` is also
    handed to ``GridSearchCV`` as its ``param_grid``.

    Parameters
    ----------
    estimator : estimator object
        Estimator to tune; it must expose ``resource_param`` as a parameter.

    param_distributions : dict
        Search space, passed unchanged to ``ParameterSampler``.

    resource_param : str, default='n_estimators'
        Estimator parameter that is overwritten with the per-round resource.

    eta : int, default=3
        Reduction factor between successive halving rounds; must be > 1.

    min_iter, max_iter : int
        Smallest and largest resource amount given to one configuration.

    skip_last : int, default=0
        Number of final successive halving rounds to skip in every bracket.

    random_state : int, RandomState instance or None, default=None
        Seed for the configuration sampler.

    scoring, n_jobs, refit, cv, verbose, pre_dispatch, error_score, \
return_train_score
        Forwarded unchanged to ``GridSearchCV``.
    """

    def __init__(self, estimator, param_distributions,
                 resource_param='n_estimators', eta=3, min_iter=1,
                 max_iter=81, skip_last=0, scoring=None, n_jobs=1,
                 refit=True, cv=None,
                 verbose=0, pre_dispatch='2*n_jobs', random_state=None,
                 error_score='raise', return_train_score=False):
        self.param_distributions = param_distributions
        self.resource_param = resource_param
        self.eta = eta
        self.min_iter = min_iter
        self.max_iter = max_iter
        self.skip_last = skip_last
        self.random_state = random_state

        super().__init__(
            estimator=estimator, param_grid=param_distributions,
            scoring=scoring, n_jobs=n_jobs,
            refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score,
            return_train_score=return_train_score)

    def _max_bracket_index(self):
        """Return the largest ``s`` with ``min_iter * eta**s <= max_iter``.

        Integer arithmetic is used instead of ``floor(log(...) / log(eta))``
        because the floating-point quotient can land just below an exact
        integer and lose a bracket.
        """
        s_max = 0
        budget = self.min_iter * self.eta
        while budget <= self.max_iter:
            s_max += 1
            budget *= self.eta
        return s_max

    def _top_configurations(self, results, n_candidates, n_to_keep):
        """Return the ``n_to_keep`` best of the last ``n_candidates`` results.

        ``evaluate_candidates`` returns the results of every candidate
        evaluated so far, so only the tail belonging to the current round is
        ranked; otherwise candidates from earlier rounds and brackets would
        compete for the surviving slots.
        """
        score_key = 'mean_test_score'
        if score_key not in results:
            # Multi-metric scoring: rank by the metric used for refitting.
            score_key = 'mean_test_%s' % self.refit

        scores = np.asarray(results[score_key][-n_candidates:], dtype=float)
        latest = results['params'][-n_candidates:]
        # Stable descending sort; NaN scores (failed fits) are ordered last.
        order = np.argsort(-scores, kind='stable')
        return [latest[j] for j in order[:n_to_keep]]

    def _run_search(self, evaluate_candidates):
        """Run every Hyperband bracket through ``evaluate_candidates``.

        Raises
        ------
        ValueError
            If a constructor argument is invalid or ``skip_last`` exceeds the
            number of successive halving rounds.
        """
        self._validate_input()

        s_max = self._max_bracket_index()
        if self.skip_last > s_max:
            raise ValueError('skip_last is higher than the total number of rounds')

        random_state = check_random_state(self.random_state)

        # One label per evaluated candidate, in evaluation order; ``fit``
        # stores it as the ``hyperband_bracket`` column of ``cv_results_``.
        self._bracket_labels = []

        for round_index, s in enumerate(reversed(range(s_max + 1))):
            # Number of configurations requested for this bracket.
            n = ((s_max + 1) // (s + 1)) * self.eta ** s

            configurations = list(ParameterSampler(param_distributions=self.param_distributions,
                                                   n_iter=n,
                                                   random_state=random_state))

            if self.verbose > 0:
                print('Starting bracket {0} (out of {1}) of hyperband'
                      .format(round_index + 1, s_max + 1))

            for i in range((s + 1) - self.skip_last):
                # r_i = max_iter * eta**(i - s), computed exactly.
                n_iterations = self.max_iter // self.eta ** (s - i)
                # floor(n_i / eta) with n_i = floor(n / eta**i).
                n_to_keep = n // self.eta ** (i + 1)

                if self.verbose > 0:
                    msg = ('Starting successive halving iteration {0} out of'
                           ' {1}. Fitting {2} configurations, with'
                           ' resource_param {3} set to {4}')

                    if n_to_keep > 0:
                        msg += ', and keeping the best {5} configurations.'

                    msg = msg.format(i + 1, s + 1, len(configurations),
                                     self.resource_param, n_iterations,
                                     n_to_keep)
                    print(msg)

                # Set the cost parameter for every configuration
                parameters = copy.deepcopy(configurations)
                for configuration in parameters:
                    configuration[self.resource_param] = n_iterations

                results = evaluate_candidates(parameters)
                self._bracket_labels.extend([round_index + 1] * len(parameters))

                if n_to_keep > 0:
                    configurations = self._top_configurations(
                        results, len(parameters), n_to_keep)

            if self.skip_last > 0 and self.verbose > 0:
                print('Skipping the last {0} successive halving iterations'
                      .format(self.skip_last))

    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        Returns
        -------
        self
            The fitted search object; ``cv_results_['hyperband_bracket']``
            holds the 1-based bracket number of every evaluated candidate.
        """
        super().fit(X=X, y=y, groups=groups, **fit_params)

        # Labels were recorded while candidates were evaluated, so the column
        # always has the same length as the other ``cv_results_`` columns.
        self.cv_results_['hyperband_bracket'] = np.asarray(
            self._bracket_labels, dtype=float)

        return self

    def _validate_input(self):
        """Check the Hyperband-specific constructor arguments.

        Raises
        ------
        ValueError
            If ``min_iter``, ``max_iter``, ``skip_last`` or ``eta`` is not a
            suitable integer, or the estimator lacks ``resource_param``.
        """
        if not isinstance(self.min_iter, int) or self.min_iter <= 0:
            raise ValueError('min_iter should be a positive integer, got %s' %
                             self.min_iter)

        if not isinstance(self.max_iter, int) or self.max_iter <= 0:
            raise ValueError('max_iter should be a positive integer, got %s' %
                             self.max_iter)

        if self.max_iter < self.min_iter:
            raise ValueError('max_iter should be bigger than min_iter, got '
                             'max_iter=%d and min_iter=%d' % (self.max_iter,
                                                              self.min_iter))

        if not isinstance(self.skip_last, int) or self.skip_last < 0:
            raise ValueError('skip_last should be a non-negative integer, '
                             'got %s' % self.skip_last)

        if not isinstance(self.eta, int) or not self.eta > 1:
            raise ValueError('eta should be an integer greater than 1, '
                             'got %s' % self.eta)

        if self.resource_param not in self.estimator.get_params().keys():
            raise ValueError('resource_param is set to %s, but base_estimator %s '
                             'does not have a parameter with that name' %
                             (self.resource_param,
                              self.estimator.__class__.__name__))


In [None]:
# Candidate values for each tuned hyper-parameter.  'n_estimators' is left out
# on purpose: it is the resource parameter, so the search overwrites it in
# every round and listing it here would only produce duplicate candidates.
param_grid = {'gamma': [0, 6.4, 25.6],
              'learning_rate': [0.03, 0.3, 1],
              'max_depth': [3, 6, 12],
              'reg_alpha': [0, 6.4, 25.6],
              'reg_lambda': [0, 6.4, 25.6]}

# Hyperband search (GridSearchCV variant) scored by accuracy.
gridsearch0 = HyperbandSearchCV2(estimator=clf_xgb,
                                 param_distributions=param_grid,
                                 resource_param='n_estimators',
                                 scoring='accuracy',
                                 return_train_score=True,
                                 verbose=1)
gridsearch0.fit(X_train, y_train)
print(gridsearch0.best_params_)

In [None]:
# Show the best parameter setting found by the search.
gridsearch0.best_params_

In [None]:
# Show the best parameter setting again (same expression as the previous cell).
gridsearch0.best_params_

In [None]:
# Tabulate the per-candidate cross-validation results of the search.
results_df = pd.DataFrame(gridsearch0.cv_results_)

In [None]:
# Display the full results table.
results_df

In [None]:
# Rows evaluated with reg_lambda=6.4, reg_alpha=25.6 and n_estimators=1.
results_df[(results_df['param_reg_lambda'] == 6.4) & (results_df['param_reg_alpha'] == 25.6) & (results_df['param_n_estimators'] == 1)]

In [None]:
# ``predict`` on a fitted search object takes only the feature matrix;
# passing the targets as a second positional argument raises a TypeError.
train_predictions = gridsearch0.predict(X_train)
# NOTE: the variable name keeps its original spelling so that any later cell
# referring to it continues to work.
test_predicition = gridsearch0.predict(X_test)