<a href="https://colab.research.google.com/github/AlkaidCheng/example/blob/master/XGBoostHParamTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install  --upgrade bayesian-optimization
! pip install ray[tune]

Requirement already up-to-date: bayesian-optimization in /usr/local/lib/python3.6/dist-packages (1.1.0)


In [2]:
!wget https://gitlab.cern.ch/clcheng/mlhep-googlesummerofcode/raw/master/Prerequisite/MachineLearning/QIS_EXAM_200Events.npz

--2020-03-08 02:54:15--  https://gitlab.cern.ch/clcheng/mlhep-googlesummerofcode/raw/master/Prerequisite/MachineLearning/QIS_EXAM_200Events.npz
Resolving gitlab.cern.ch (gitlab.cern.ch)... 188.184.116.40, 188.184.30.115, 188.184.104.112, ...
Connecting to gitlab.cern.ch (gitlab.cern.ch)|188.184.116.40|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9140 (8.9K) [application/zip]
Saving to: ‘QIS_EXAM_200Events.npz.2’


2020-03-08 02:54:17 (339 MB/s) - ‘QIS_EXAM_200Events.npz.2’ saved [9140/9140]



In [0]:
import numpy as np

In [0]:
def shuffle_zippedarrays(arrays):
  """Shuffle several arrays in unison along their first axis.

  The same random permutation (drawn from NumPy's global RNG) is applied to
  every array, so row i of each output still corresponds to row i of the others.

  Args:
    arrays: sequence of NumPy arrays that share the same length along axis 0.
      Trailing dimensions may differ (e.g. a 2-D feature matrix and a 1-D
      label vector).

  Returns:
    A list with one shuffled copy per input array, in the same order.

  Raises:
    ValueError: if the arrays do not all have the same number of rows.
  """
  arrays = list(arrays)
  if not arrays:
    return []
  n_rows = arrays[0].shape[0]
  # Only the first axis has to agree; comparing full shapes would reject the
  # usual (features, labels) pair.
  if any(arr.shape[0] != n_rows for arr in arrays):
    raise ValueError(
        'all arrays must have the same length along axis 0, got %s'
        % [arr.shape[0] for arr in arrays])
  index = np.random.permutation(n_rows)
  return [arr[index] for arr in arrays]

def shuffle_data(input, label):
  """Shuffle an input array and its label array with one shared permutation.

  Returns:
    A tuple (shuffled_input, shuffled_label).
  """
  shuffled_input, shuffled_label = shuffle_zippedarrays([input, label])
  return shuffled_input, shuffled_label

def load_data(arrays, shuffle = True):
  """Build per-split input/label arrays from a mapping of per-class samples.

  Each value of `arrays` must be a 0-d object array whose `.item()` is a dict
  mapping a class label (anything `int()` accepts) to an array of samples with
  one sample per row.

  Args:
    arrays: mapping (e.g. the object returned by `np.load` on an .npz file)
      from split name to such a 0-d object array.
    shuffle: when True, samples and labels of every split are shuffled
      together via `shuffle_data`.

  Returns:
    dict mapping each split name to {'input': samples, 'label': labels}, where
    `labels` holds the integer class of the corresponding sample row.
  """
  data = {}
  for key in arrays:
    # Look the entry up exactly once: an NpzFile re-reads and unpickles the
    # member from disk on every `arrays[key]` access.
    samples_by_class = arrays[key].item()
    sample_chunks, label_chunks = [], []
    for klabel, chunk in samples_by_class.items():
      sample_chunks.append(chunk)
      label_chunks.append(np.full((chunk.shape[0],), int(klabel)))
    # Concatenate once instead of growing the arrays inside the loop; an empty
    # split yields empty arrays.
    samples = np.concatenate(sample_chunks, axis=0) if sample_chunks else np.array([])
    labels = np.concatenate(label_chunks, axis=0) if label_chunks else np.array([])
    if shuffle:
      samples, labels = shuffle_data(samples, labels)
    data[key] = {'input': samples, 'label': labels}
  return data
def load_train_test_input_labels(arrays, shuffle = True):
  """Load the 'training_input' and 'test_input' splits from `arrays`.

  Returns:
    A tuple (train_input, test_input, train_label, test_label).
  """
  splits = load_data(arrays, shuffle)
  train = splits['training_input']
  test = splits['test_input']
  return train['input'], test['input'], train['label'], test['label']

In [0]:
# Load the downloaded archive and split it into train/test inputs and labels.
# NOTE: allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code -- only use it on files from a trusted source.
data = np.load('QIS_EXAM_200Events.npz',allow_pickle=True)
# Shuffling is on by default and is not seeded, so the row order differs between runs.
train_input, test_input, train_label, test_label = load_train_test_input_labels(data)

In [8]:
import xgboost as xgb
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.schedulers import AsyncHyperBandScheduler
import ray
from hyperopt import hp
from ray import tune


def XGBCallback(env):
    """XGBoost training callback that forwards evaluation results to Tune.

    Every (name, value) pair in `env.evaluation_result_list` is reported as a
    keyword argument to `tune.track.log`.
    """
    metrics = {name: value for name, value in env.evaluation_result_list}
    tune.track.log(**metrics)


def train_breast_cancer(config):
    """Tune trainable: fit XGBoost with `config` and report test accuracy.

    Despite its name, this trains on the notebook-level `train_input` /
    `train_label` arrays and evaluates on `test_input` / `test_label`; the
    sklearn breast-cancer dataset is not used.

    Args:
        config: dict of XGBoost parameters sampled by the search algorithm,
            passed unchanged to `xgb.train`.

    Reports (via `tune.track.log`):
        the evaluation metrics of every boosting round (through `XGBCallback`)
        and a final `mean_accuracy` on the test set together with `done=True`.
    """
    train_set = xgb.DMatrix(train_input, label=train_label)
    test_set = xgb.DMatrix(test_input, label=test_label)
    bst = xgb.train(
        config, train_set, evals=[(test_set, "eval")], callbacks=[XGBCallback])
    preds = bst.predict(test_set)
    # Round the predicted scores to the nearest integer to obtain 0/1 class labels.
    pred_labels = np.rint(preds)
    tune.track.log(
        mean_accuracy=sklearn.metrics.accuracy_score(test_label, pred_labels),
        done=True)

ray.shutdown()
ray.init(webui_host='127.0.0.1')

num_threads = 2

space = {
    "verbosity" : 0,
    "max_depth": hp.choice("max_depth", range(3, 26, 1)),
    "num_threads": 4,
    "objective" : "binary:logistic",
    "booster": "gbtree",
    "eval_metric": "logloss",
    "colsample_bytree" : hp.uniform("colsample_bytree",0.5,1),
    "min_child_weight" : hp.quniform("min_child_weight", 1, 11, 1),
    "subsample" : hp.uniform("subsample", 0.5, 1),
    "eta" :  hp.uniform("eta", 0.01, 0.2),
    "tree_method" : 'gpu_hist',
    "gamma" : hp.uniform("gamma", 0, 10),
    "grow_policy": "lossguide"}
algo = HyperOptSearch(
        space,
        max_concurrent=4,
        metric="mean_accuracy",
        mode="min")
from ray.tune.schedulers import ASHAScheduler
! rm -r /xgboost_result
analysis = tune.run(
    train_breast_cancer,
    num_samples=100,
    verbose =  0,
    resources_per_trial={'gpu': 1},
    search_alg = algo,
    local_dir = '/xgboost_result',
    scheduler=AsyncHyperBandScheduler(metric="mean_accuracy", mode="min"))
print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

2020-03-08 02:58:13,818	INFO resource_spec.py:212 -- Starting Ray with 6.59 GiB memory available for workers and up to 3.3 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-08 02:58:14,122	INFO services.py:1078 -- View the Ray dashboard at [1m[32m127.0.0.1:8265[39m[22m
2020-03-08 02:58:18,366	INFO function_runner.py:250 -- tune.track signature detected.


[2m[36m(pid=14107)[0m tcmalloc: large alloc 1073741824 bytes == 0x30bd8000 @  0x7f3ba930e887 0x7f3aaa371dff 0x7f3aaa37cfe0 0x7f3aaa37d447 0x7f3aa9e17ecf 0x7f3aaa374289 0x7f3aaa37524d 0x7f3aaa1bf791 0x7f3aaa1c0c95 0x7f3aaa1d3556 0x7f3aaa136aa5 0x7f3ba6d3fdae 0x7f3ba6d3f71f 0x7f3ba6f535c4 0x7f3ba6f53c33 0x5aa6ec 0x50abb3 0x50c5b9 0x508245 0x50a080 0x50aa7d 0x50c5b9 0x508245 0x50a080 0x50aa7d 0x50d390 0x508245 0x50a080 0x50aa7d 0x50d390 0x509d48
[2m[36m(pid=14107)[0m [0]	eval-logloss:0.677586
[2m[36m(pid=14107)[0m [1]	eval-logloss:0.658818
[2m[36m(pid=14107)[0m [2]	eval-logloss:0.645018
[2m[36m(pid=14107)[0m [3]	eval-logloss:0.638047
[2m[36m(pid=14107)[0m [4]	eval-logloss:0.62439
[2m[36m(pid=14107)[0m [5]	eval-logloss:0.615481
[2m[36m(pid=14107)[0m [6]	eval-logloss:0.606897
[2m[36m(pid=14107)[0m [7]	eval-logloss:0.600198
[2m[36m(pid=14107)[0m [8]	eval-logloss:0.598537
[2m[36m(pid=14107)[0m [9]	eval-logloss:0.590687
[2m[36m(pid=14106)[0m [0]	eval-logloss



[2m[36m(pid=14175)[0m [0]	eval-logloss:0.676414
[2m[36m(pid=14175)[0m [1]	eval-logloss:0.661563
[2m[36m(pid=14175)[0m [2]	eval-logloss:0.656147
[2m[36m(pid=14175)[0m [3]	eval-logloss:0.636826
[2m[36m(pid=14175)[0m [4]	eval-logloss:0.633279
[2m[36m(pid=14175)[0m [5]	eval-logloss:0.630422
[2m[36m(pid=14175)[0m [6]	eval-logloss:0.620633
[2m[36m(pid=14175)[0m [7]	eval-logloss:0.616597
[2m[36m(pid=14175)[0m [8]	eval-logloss:0.617185
[2m[36m(pid=14175)[0m [9]	eval-logloss:0.610813
[2m[36m(pid=14191)[0m [0]	eval-logloss:0.690594
[2m[36m(pid=14191)[0m [1]	eval-logloss:0.678124
[2m[36m(pid=14191)[0m [2]	eval-logloss:0.674825
[2m[36m(pid=14191)[0m [3]	eval-logloss:0.672541
[2m[36m(pid=14191)[0m [4]	eval-logloss:0.66007
[2m[36m(pid=14191)[0m [5]	eval-logloss:0.660066
[2m[36m(pid=14191)[0m [6]	eval-logloss:0.658189
[2m[36m(pid=14191)[0m [7]	eval-logloss:0.658185
[2m[36m(pid=14191)[0m [8]	eval-logloss:0.65457
[2m[36m(pid=14191)[0m [9]	eval



[2m[36m(pid=14982)[0m [0]	eval-logloss:0.65987
[2m[36m(pid=14982)[0m [1]	eval-logloss:0.633185
[2m[36m(pid=14982)[0m [2]	eval-logloss:0.624547
[2m[36m(pid=14982)[0m [3]	eval-logloss:0.60956
[2m[36m(pid=14982)[0m [4]	eval-logloss:0.593243
[2m[36m(pid=14982)[0m [5]	eval-logloss:0.587285
[2m[36m(pid=14982)[0m [6]	eval-logloss:0.579297
[2m[36m(pid=14982)[0m [7]	eval-logloss:0.578659
[2m[36m(pid=14982)[0m [8]	eval-logloss:0.573602
[2m[36m(pid=14982)[0m [9]	eval-logloss:0.566777
[2m[36m(pid=14998)[0m [0]	eval-logloss:0.66487
[2m[36m(pid=14998)[0m [1]	eval-logloss:0.636962
[2m[36m(pid=14998)[0m [2]	eval-logloss:0.622317
[2m[36m(pid=14998)[0m [3]	eval-logloss:0.591602
[2m[36m(pid=14998)[0m [4]	eval-logloss:0.581471
[2m[36m(pid=14998)[0m [5]	eval-logloss:0.585839
[2m[36m(pid=14998)[0m [6]	eval-logloss:0.570758
[2m[36m(pid=14998)[0m [7]	eval-logloss:0.573738
[2m[36m(pid=14998)[0m [8]	eval-logloss:0.568155
[2m[36m(pid=14998)[0m [9]	eval-



[2m[36m(pid=15364)[0m [0]	eval-logloss:0.668903
[2m[36m(pid=15364)[0m [1]	eval-logloss:0.649648
[2m[36m(pid=15364)[0m [2]	eval-logloss:0.644995
[2m[36m(pid=15364)[0m [3]	eval-logloss:0.626034
[2m[36m(pid=15364)[0m [4]	eval-logloss:0.612635
[2m[36m(pid=15364)[0m [5]	eval-logloss:0.608351
[2m[36m(pid=15364)[0m [6]	eval-logloss:0.598329
[2m[36m(pid=15364)[0m [7]	eval-logloss:0.592802
[2m[36m(pid=15364)[0m [8]	eval-logloss:0.585287
[2m[36m(pid=15364)[0m [9]	eval-logloss:0.580409
[2m[36m(pid=15383)[0m [0]	eval-logloss:0.676915
[2m[36m(pid=15383)[0m [1]	eval-logloss:0.661545
[2m[36m(pid=15383)[0m [2]	eval-logloss:0.651575
[2m[36m(pid=15383)[0m [3]	eval-logloss:0.638827
[2m[36m(pid=15383)[0m [4]	eval-logloss:0.628072
[2m[36m(pid=15383)[0m [5]	eval-logloss:0.623133
[2m[36m(pid=15383)[0m [6]	eval-logloss:0.615102
[2m[36m(pid=15383)[0m [7]	eval-logloss:0.610921
[2m[36m(pid=15383)[0m [8]	eval-logloss:0.605234
[2m[36m(pid=15383)[0m [9]	ev



[2m[36m(pid=15783)[0m [9]	eval-logloss:0.567842
[2m[36m(pid=15799)[0m [0]	eval-logloss:0.667532
[2m[36m(pid=15799)[0m [1]	eval-logloss:0.640027
[2m[36m(pid=15799)[0m [2]	eval-logloss:0.629135
[2m[36m(pid=15799)[0m [3]	eval-logloss:0.617145
[2m[36m(pid=15799)[0m [4]	eval-logloss:0.602354
[2m[36m(pid=15799)[0m [5]	eval-logloss:0.598412
[2m[36m(pid=15799)[0m [6]	eval-logloss:0.592764
[2m[36m(pid=15799)[0m [7]	eval-logloss:0.59129
[2m[36m(pid=15799)[0m [8]	eval-logloss:0.590106
[2m[36m(pid=15799)[0m [9]	eval-logloss:0.582892


2020-03-08 03:01:31,969	INFO tune.py:352 -- Returning an analysis object by default. You can call `analysis.trials` to retrieve a list of trials. This message will be removed in future versions of Tune.


Best config:  {'booster': 'gbtree', 'colsample_bytree': 0.9480213147848217, 'eta': 0.09609772163997372, 'eval_metric': 'logloss', 'gamma': 4.379610693019819, 'grow_policy': 'lossguide', 'max_depth': 6, 'min_child_weight': 7.0, 'num_threads': 4, 'objective': 'binary:logistic', 'subsample': 0.5778642303546857, 'tree_method': 'gpu_hist', 'verbosity': 0}


In [11]:
# Scratch cell: draw 200 random integers from 1 to 10 (the upper bound 11 is exclusive).
np.random.randint(1,11, size=200)

array([ 9,  9,  9, 10,  6,  6,  2,  9,  3,  4,  6,  1,  9,  6,  9,  3, 10,
        6,  8,  2,  4,  9,  5, 10,  4,  7,  6,  8,  8,  1, 10,  2,  8,  4,
        8,  3,  7,  7,  1,  6,  6,  7,  9,  1, 10,  2,  3, 10,  1,  4,  3,
        9,  4,  2,  3, 10,  6, 10,  2,  4,  8,  3,  3,  3,  9,  5,  1,  8,
        9,  7,  5,  5,  3,  8,  1,  1,  8,  4,  1,  9,  8,  3,  5,  1,  1,
        5,  6,  7,  1,  2,  2,  1,  5,  2,  5,  9,  7,  8,  7,  3,  9,  3,
        2,  2, 10,  3,  7,  3,  9,  3,  5,  2,  4,  6, 10,  4,  6, 10,  7,
       10,  6, 10,  9,  2,  5,  1,  6,  4,  7, 10,  3,  9,  7,  7,  8,  5,
       10,  2,  1,  7,  6,  1,  9,  7,  9, 10,  7, 10,  1,  9,  7,  5,  8,
        9,  2,  8,  5,  1,  3,  9,  3,  6,  3,  5,  7,  5,  9,  6,  1, 10,
        6,  4,  5, 10,  5,  5, 10,  4, 10,  3,  8,  7,  8,  2,  6,  9,  2,
        3,  8,  4,  9,  3,  8,  3,  7, 10,  6, 10,  5,  1])