# Hyperopt playground

Bayesian optimization is a probabilistic, model-based approach for finding the minimum of any function that returns a real-valued metric.<br> This function may be as simple as $f(x) = x^2$, or as complex as the validation error of a deep neural network with respect to hundreds of model architecture and hyperparameter choices.

It turns out that this probabilistic approach is more efficient than manual, random, or grid search with regard to:<br>
> Better overall performance on the test set<br>
> Less time required for optimization

Hyperopt is one of the libraries that implement Bayesian optimization.

Bayesian optimization works by building a probability model of the objective function that maps input values to a probability of a loss: $P(\text{loss} \mid \text{input values})$.<br> 
The probability model, also called the surrogate or response surface, is easier to optimize than the actual objective function.<br>
Bayesian methods select the next values to evaluate by applying a criterion (usually Expected Improvement) to the surrogate.<br>
The idea is to limit the number of evaluations of the objective function by spending more time choosing the next values to try.

In [1]:
# Fixed XGBoost parameter set kept for reference.
# NOTE(review): the name suggests these values came from a Colab run -- confirm.
from_colab = dict(
    booster="gbtree",
    colsample_bytree=0.7000000000000001,
    eval_metric="auc",
    gamma=0.0,
    learning_rate=0.082,
    max_depth=5,
    min_child_weight=8.0,
    n_estimators=1920,
    nthread=-1,
    objective="binary:logistic",
    reg_lambda=5.5,
    scale_pos_weight=1,
    subsample=0.708767041292768,
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
import xgboost as xgb

In [None]:
from xgboost import XGBClassifier, DMatrix
from hyperopt import STATUS_OK

from hyperopt import hp, tpe, Trials
from hyperopt.fmin import fmin

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

In [None]:
# Relative paths of the CSV files that are passed to load_preprocess_data().
# DATA_PATH is the file loaded as the "toy dataset" further down.
DATA_PATH = "data_playground/dane_zad1.csv"
SONAR_DATA_PATH = "data_playground/sonar.csv"
FRAUD_DATA_PATH = "data_playground/creditcard.csv"
DIABETES_DATA_PATH = "data_playground/diabetes.csv"

In [None]:
def load_preprocess_data(data_path, encoder=None, test_size=0.2, val_size=0.25, random_state=42):
    """Load a CSV file and split it into stratified train/validation/test sets.

    The last column of the file is taken as the target and every other column
    as a feature.

    Args:
        data_path: location of the CSV file, handed to ``pd.read_csv``.
        encoder: optional callable ``encoder(X, y) -> (X, y)`` applied to the
            raw arrays before splitting.
        test_size: share of all rows held out as the test set.
        val_size: share of the remaining (non-test) rows held out as the
            validation set. With the defaults the result is a 60/20/20 split.
        random_state: seed used for both splits.

    Returns:
        Tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
    """
    df = pd.read_csv(data_path)
    data = df.values
    X, y = data[:, :-1], data[:, -1]
    if encoder is not None:
        X, y = encoder(X, y)
    # First carve out the test set, then split the remainder into train and
    # validation; both splits keep the class proportions of the target.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=val_size, random_state=random_state, stratify=y_train
    )
    return x_train, x_val, x_test, y_train, y_val, y_test

# Data

In [None]:
def toy_encoder(X, y):
    """Return the features cast to float together with the untouched target."""
    return X.astype(float), y

# toy dataset
# Binds the module-level x_*/y_* splits that the training and scoring
# functions below read as globals.
x_train, x_val, x_test, y_train, y_val, y_test = load_preprocess_data(DATA_PATH, encoder=toy_encoder)

In [None]:
def sonar_encoder(X, y):
    """Cast the features to float and turn the target into a boolean array.

    An entry of the returned target is True where the label equals the first
    label in ``y`` and False everywhere else.
    """
    features = X.astype(float)
    reference_label = y[0]
    target = y == reference_label
    return features, target

# sonar dataset
# Rebinds the same module-level splits; sonar_encoder converts the labels
# to booleans before the split.
x_train, x_val, x_test, y_train, y_val, y_test = load_preprocess_data(SONAR_DATA_PATH, encoder=sonar_encoder)

In [None]:
# fraud dataset
# No encoder is passed, so the arrays are split exactly as read from the CSV.
x_train, x_val, x_test, y_train, y_val, y_test = load_preprocess_data(FRAUD_DATA_PATH)

In [None]:
# diabetes dataset
# No encoder is passed; this overwrites the splits bound by any loading cell
# that was run before it.
x_train, x_val, x_test, y_train, y_val, y_test = load_preprocess_data(DIABETES_DATA_PATH)

# Baseline

In [None]:
def transform_params(params):
    """Cast the integer-valued hyperparameters of ``params`` to ``int``.

    ``n_estimators`` and ``max_depth`` are converted in place, in that order,
    and the very same dictionary object is returned.
    """
    for key in ("n_estimators", "max_depth"):
        params[key] = int(params[key])
    return params

In [None]:
def test_parameters(params):
    """Fit an XGBClassifier with ``params`` and return its ROC AUC on the test split.

    Reads the module-level ``x_train``/``y_train``, ``x_val``/``y_val`` and
    ``x_test``/``y_test`` arrays. ``params`` is passed through
    ``transform_params`` first, which modifies it in place.
    """
    model = XGBClassifier(**transform_params(params))

    # Train and validation splits are both tracked during fitting, with the
    # validation split listed last.
    watchlist = [(x_train, y_train), (x_val, y_val)]
    model.fit(
        x_train,
        y_train,
        eval_set=watchlist,
        eval_metric="auc",
        early_stopping_rounds=50,
        verbose=False,
    )

    # Score with the second column of predict_proba.
    test_scores = model.predict_proba(x_test)[:, 1]
    return roc_auc_score(y_test, test_scores)

In [None]:
# Starting parameter set; the tuning sections below overwrite single entries
# of this dictionary step by step.
current_space = dict(
    booster="gbtree",
    objective="binary:logistic",
    eval_metric="auc",
    nthread=-1,
    learning_rate=0.1,
    n_estimators=100,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    reg_lambda=1,
)

In [None]:
# Baseline: test-set ROC AUC of the hand-written starting parameters.
test_parameters(current_space)

## Optimum number of trees

In [None]:
def find_n_estimators(params, cv_folds=5, early_stopping_rounds=50):
    """Choose the number of boosting rounds by cross-validation with early stopping.

    Runs ``xgb.cv`` on the module-level training split with AUC as the metric,
    using the ``n_estimators`` of ``params`` as the upper bound on rounds.

    Args:
        params: XGBClassifier keyword arguments; passed through
            ``transform_params``, which modifies the dict in place.
        cv_folds: number of cross-validation folds.
        early_stopping_rounds: forwarded to ``xgb.cv``.

    Returns:
        dict: the classifier's booster parameters plus an ``"n_estimators"``
        entry holding the number of rounds kept by ``xgb.cv``.
    """
    params = transform_params(params)

    clf_xgb = XGBClassifier(**params)

    dtrain = DMatrix(x_train, label=y_train)
    xgb_param = clf_xgb.get_xgb_params()
    # Read the round budget from the estimator's own parameters:
    # get_xgb_params() may leave out wrapper-only settings such as
    # n_estimators, depending on the installed XGBoost version.
    max_rounds = clf_xgb.get_params()["n_estimators"]

    cvresult = xgb.cv(xgb_param, dtrain, num_boost_round=max_rounds, nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
    # One row per boosting round that was kept.
    best_rounds = cvresult.shape[0]
    clf_xgb.set_params(n_estimators=best_rounds)

    # Add n_estimators explicitly so callers can always read it from the result.
    result = clf_xgb.get_xgb_params()
    result["n_estimators"] = best_rounds
    return result


In [None]:
# Replace the starting n_estimators with the value returned by
# find_n_estimators(); all other entries of current_space stay as they are.
params = find_n_estimators(current_space)
current_space["n_estimators"] = params["n_estimators"]

In [None]:
# Test-set ROC AUC with the updated n_estimators.
test_parameters(current_space)

In [None]:
# Cell output: the current parameter dictionary.
current_space

# Tune

In [None]:
def objective(params):
    """Hyperopt objective: ``1 - ROC AUC`` on the validation split.

    Fits an XGBClassifier on the module-level training split using the
    sampled ``params`` (cast in place by ``transform_params``) and scores it
    on ``x_val``/``y_val``.

    Returns:
        dict with the ``loss`` to minimise and ``status`` set to ``STATUS_OK``.
    """
    model = XGBClassifier(**transform_params(params))

    watchlist = [(x_train, y_train), (x_val, y_val)]
    model.fit(
        x_train,
        y_train,
        eval_set=watchlist,
        eval_metric="auc",
        early_stopping_rounds=50,
        verbose=False,
    )

    val_scores = model.predict_proba(x_val)[:, 1]
    val_auc = roc_auc_score(y_val, val_scores)

    # The optimiser minimises, so the AUC is turned into a loss.
    return {'loss': 1 - val_auc, 'status': STATUS_OK}

## Tune max_depth and min_child_weight

In [None]:
# Search ranges for the two tree-shape parameters, both quantised to steps of 1.
space = {
    "max_depth": hp.quniform("max_depth", 1, 30, 1),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
}

In [None]:
# Swap the fixed values for their search expressions; every other entry of
# current_space stays constant during the search.
for name in ("max_depth", "min_child_weight"):
    current_space[name] = space[name]

In [None]:
# Run 50 TPE evaluations of objective() over current_space; `trials` records
# every evaluation and `best` receives the result returned by fmin.
trials = Trials()
best = fmin(fn=objective,
            space=current_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

In [None]:
# Freeze the tuned values in current_space, then score them on the test split.
for name in ("max_depth", "min_child_weight"):
    current_space[name] = best[name]
test_parameters(current_space)

In [None]:
# Cell output: parameters after the max_depth / min_child_weight step.
current_space

## Tune gamma

In [None]:
# Search range for gamma: values between 0 and 1 in steps of 0.05.
space ={
    'gamma': hp.quniform('gamma', 0.0, 1, 0.05)
        }

In [None]:
# Only gamma is searched in this step.
current_space["gamma"] = space["gamma"]

In [None]:
# Fresh Trials object, so the history of the previous step is not reused.
trials = Trials()
best = fmin(fn=objective,
            space=current_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

In [None]:
# Freeze the tuned gamma and score the updated parameters on the test split.
current_space["gamma"] = best["gamma"]
test_parameters(current_space)

In [None]:
# Cell output: parameters after the gamma step.
current_space

## Tune subsample and colsample_bytree

In [None]:
# Search ranges for the row and column sampling ratios: subsample is
# continuous, colsample_bytree is quantised to steps of 0.05.
space = {
    "subsample": hp.uniform("subsample", 0.7, 1),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.6, 1, 0.05),
}

In [None]:
# Swap the fixed sampling ratios for their search expressions.
for name in ("subsample", "colsample_bytree"):
    current_space[name] = space[name]

In [None]:
# 50 TPE evaluations of objective() with only the sampling ratios left free.
trials = Trials()
best = fmin(fn=objective,
            space=current_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

In [None]:
# Freeze the tuned sampling ratios, then score them on the test split.
for name in ("subsample", "colsample_bytree"):
    current_space[name] = best[name]
test_parameters(current_space)

In [None]:
# Cell output: parameters after the subsample / colsample_bytree step.
current_space

## Tuning Regularization Parameters

In [None]:
# Search range for reg_lambda: values between 0 and 10 in steps of 0.5.
space ={
        'reg_lambda' :  hp.quniform('reg_lambda', 0, 10, 0.5)
        }

In [None]:
# Only reg_lambda is searched in this step.
current_space["reg_lambda"] = space["reg_lambda"]

In [None]:
# 50 TPE evaluations of objective() with reg_lambda as the only free parameter.
trials = Trials()
best = fmin(fn=objective,
            space=current_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

In [None]:
# Freeze the tuned reg_lambda and score the updated parameters on the test split.
current_space["reg_lambda"] = best["reg_lambda"]
test_parameters(current_space)

In [None]:
# Cell output: parameters after the regularization step.
current_space

## Tune lr and n_estimators

In [None]:
# The current learning rate is the upper bound of the new learning-rate range
# and the current n_estimators the lower bound of the new tree-count range.
# Both entries of current_space must still be plain numbers here, i.e. this
# cell has to run before the next one replaces them with search expressions.
space = {
    'learning_rate': hp.quniform('learning_rate', 0.001, current_space["learning_rate"], 0.002),
    "n_estimators": hp.quniform("n_estimators", current_space["n_estimators"], 5000, 20),
}

In [None]:
# Swap the fixed learning rate and tree count for their search expressions.
for name in ("learning_rate", "n_estimators"):
    current_space[name] = space[name]

In [None]:
# 50 TPE evaluations of objective() over learning_rate and n_estimators.
trials = Trials()
best = fmin(fn=objective,
            space=current_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

In [None]:
# Freeze the tuned learning rate and tree count, then score on the test split.
for name in ("learning_rate", "n_estimators"):
    current_space[name] = best[name]
test_parameters(current_space)

In [None]:
# Same statements as the previous cell: re-apply the tuned values and refit.
current_space["learning_rate"] = best["learning_rate"]
current_space["n_estimators"] = best["n_estimators"]
test_parameters(current_space)

In [None]:
# Same statements as the previous cell: re-apply the tuned values and refit.
current_space["learning_rate"] = best["learning_rate"]
current_space["n_estimators"] = best["n_estimators"]
test_parameters(current_space)

In [None]:
# Cell output: parameters after the last step of the stepwise tuning.
current_space

# Basic

Formulating an optimization problem in Hyperopt requires four parts:<br>
> Objective Function: takes in an input and returns a loss to minimize <br>
> Domain space: the range of input values to evaluate<br>
> Optimization Algorithm: the method used to construct the surrogate function and choose the next values to evaluate<br>
> Results: score, value pairs that the algorithm uses to build the model<br>

In [None]:
# Joint search space: the first four entries are constants, every other entry
# is a hyperopt search expression, so all of them are tuned in a single run.
# NOTE(review): the scale_pos_weight range starts at 0.0, so 0.0 itself can be
# sampled -- confirm that this is intended.
domain_space = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "nthread": -1,
    "max_depth": hp.quniform("max_depth", 1, 30, 1),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "gamma": hp.quniform("gamma", 0.0, 1, 0.05),
    "subsample": hp.uniform("subsample", 0.7, 1),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.6, 1, 0.05),
    "reg_lambda": hp.quniform("reg_lambda", 0, 10, 0.5),
    "learning_rate": hp.quniform("learning_rate", 0.001, 0.5, 0.002),
    "n_estimators": hp.quniform("n_estimators", 10, 1000, 20),
    "scale_pos_weight": hp.quniform("scale_pos_weight", 0.0, 1, 0.05),
}

In [None]:
def objective(params):
    """Hyperopt objective: ``1 - ROC AUC`` on the validation split.

    Fits an XGBClassifier on the module-level training split with the sampled
    ``params`` and scores it on ``x_val``/``y_val``.

    Args:
        params: sampled parameter dictionary; ``n_estimators`` and
            ``max_depth`` are cast to ``int`` by ``transform_params``.

    Returns:
        dict with the ``loss`` to minimise and ``status`` set to ``STATUS_OK``.
    """
    # Use the returned dictionary explicitly instead of relying on
    # transform_params() having modified its argument in place.
    params = transform_params(params)

    clf_xgb = XGBClassifier(**params)

    eval_set = [(x_train, y_train), (x_val, y_val)]

    clf_xgb.fit(x_train, y_train,
            eval_set=eval_set, eval_metric="auc",
            early_stopping_rounds=50, verbose=False)

    pred = clf_xgb.predict_proba(x_val)[:, 1]
    auc = roc_auc_score(y_val, pred)

    # The optimiser minimises, so the AUC is turned into a loss.
    return {'loss': 1 - auc, 'status': STATUS_OK}

In [None]:
# 50 TPE evaluations of objective() over the joint search space.
trials = Trials()
best = fmin(fn=objective,
            space=domain_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

In [None]:
# Copy the constant settings from the search space into `best`, then score the
# completed parameter set on the test split.
for name in ("booster", "objective", "eval_metric", "nthread"):
    best[name] = domain_space[name]
test_parameters(best)

In [None]:
# Cell output: the parameter set found by the joint search.
best

# Memory efficient approach

In [None]:
# Joint search space for the xgb.train-based objective. It uses the keys
# 'lambda' and 'eta' where the previous space used 'reg_lambda' and
# 'learning_rate'.
# NOTE(review): the scale_pos_weight range starts at 0.0, so 0.0 itself can be
# sampled -- confirm that this is intended.
domain_space = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "nthread": -1,
    "max_depth": hp.quniform("max_depth", 1, 30, 1),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "gamma": hp.quniform("gamma", 0.0, 1, 0.05),
    "subsample": hp.uniform("subsample", 0.7, 1),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.6, 1, 0.05),
    "lambda": hp.quniform("lambda", 0, 10, 0.5),
    "eta": hp.quniform("eta", 0.001, 0.5, 0.002),
    "n_estimators": hp.quniform("n_estimators", 10, 1000, 20),
    "scale_pos_weight": hp.quniform("scale_pos_weight", 0.0, 1, 0.05),
}

In [None]:
def objective_me(space):
    """Hyperopt objective built on the native ``xgb.train`` API.

    Trains a booster on the module-level training split and returns
    ``1 - ROC AUC`` measured on the validation split.

    Args:
        space: sampled parameter dictionary; ``n_estimators`` and
            ``max_depth`` are cast to ``int`` by ``transform_params``.

    Returns:
        dict with the ``loss`` to minimise and ``status`` set to ``STATUS_OK``.
    """
    # Work on a copy so that removing n_estimators below does not alter the
    # dictionary handed in by the caller.
    params = dict(transform_params(space))
    # xgb.train takes the number of boosting rounds as its own argument; left
    # inside the parameter dict, n_estimators has no effect and the default
    # round count is used instead.
    num_boost_round = params.pop("n_estimators")

    dtrain = DMatrix(x_train, label=y_train)
    dval = DMatrix(x_val, label=y_val)
    # The validation set is listed last in the watchlist.
    evallist = [(dtrain, 'train'), (dval, 'eval')]
    clf = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_round,
                    evals=evallist, early_stopping_rounds=50, verbose_eval=False)

    pred = clf.predict(dval)
    auc = roc_auc_score(y_val, pred)

    return {'loss': 1 - auc, 'status': STATUS_OK}

In [None]:
# 50 TPE evaluations of objective_me() over the joint search space.
trials = Trials()
best = fmin(fn=objective_me,
            space=domain_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

In [None]:
# Copy the constant settings from the search space into `best`, then score the
# completed parameter set on the test split.
for name in ("booster", "objective", "eval_metric", "nthread"):
    best[name] = domain_space[name]
test_parameters(best)

In [None]:
# Cell output: the parameter set found with objective_me().
best

# Search Visualization

In [None]:
# One row per trial: its loss, its index (taken from the max_depth entries)
# and one column per searched hyperparameter.
trial_indices, trial_values = trials.idxs_vals
tpe_results = pd.DataFrame({
    'loss': [result['loss'] for result in trials.results],
    'iteration': trial_indices['max_depth'],
    **trial_values,
})

tpe_results.head()

In [None]:
from itertools import cycle

# Scatter plot of every sampled hyperparameter value against the trial index,
# all drawn on one shared axes object.
# The first two columns are 'loss' and 'iteration'; the rest are hyperparameters.
param_columns = tpe_results.columns[2:]
# Cycle through the palette so that every column gets a colour even when the
# search space has more hyperparameters than the palette has entries.
palette = ["green", "red", "m", "purple", "orange", "blue", "brown", "cyan", "olive", "gray"]
color = dict(zip(param_columns, cycle(palette)))
figsize = (10, 10)
ax = None
for col in param_columns:
    ax = tpe_results.plot.scatter(x="iteration", y=col, grid=True, xticks=range(tpe_results.shape[0]), ax=ax, c=color[col], label=col, figsize=figsize, s=100);

In [None]:
# Histogram (50 bins) of the sampled values of all columns after 'loss' and
# 'iteration'.
tpe_results.iloc[:,2:].plot.hist(bins=50, figsize=(10,10), edgecolor = 'k');

# Parameters:

In [None]:
# Reference search space listing the XGBoost parameters that can be tuned.
# Each hyperopt label equals its dictionary key, so that the result of fmin
# can be indexed with the parameter name.
# NOTE: 'lambda'/'reg_lambda' and 'eta'/'learning_rate' are alias pairs of the
# same XGBoost setting; keep only one entry of each pair when searching.
space = {
    "booster": 'gbtree',
    "objective": 'binary:logistic',
    "eval_metric": 'auc',
    "nthread": -1,
    "n_estimators": 100,
    'max_depth': hp.quniform("max_depth", 1, 30, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.uniform('subsample', 0.8, 1),
    'gamma': hp.quniform('gamma', 0.0, 1, 0.05),
    'lambda': hp.quniform('lambda', 0, 10, 1),
    'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
    'learning_rate': hp.quniform('learning_rate', 0.001, 0.2, 0.01),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
}

# Embedded clf in xgb lib

In [None]:
from hpsklearn import HyperoptEstimator, xgboost_classification

def auc(y_true, y_pred):
    """Return ``1 - ROC AUC`` so that a better ranking yields a smaller loss."""
    score = roc_auc_score(y_true, y_pred)
    return 1 - score

# Let hyperopt-sklearn search over its built-in XGBoost classifier component.
# No preprocessing steps are searched, the search is limited to 10 evaluations
# and auc() is the loss to minimise.
estim = HyperoptEstimator(classifier=xgboost_classification("xgb_clf"),
                          preprocessing=[],
                          algo=tpe.suggest,
                          max_evals=10,
                          trial_timeout=300,
                          loss_fn=auc)

estim.fit(x_train, y_train)