<a href="https://colab.research.google.com/github/AlkaidCheng/example/blob/master/LightGBMHParamTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install  --upgrade bayesian-optimization
! pip install ray[tune]

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/b5/26/9842333adbb8f17bcb3d699400a8b1ccde0af0b6de8d07224e183728acdf/bayesian_optimization-1.1.0-py3-none-any.whl
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.1.0
Collecting ray[tune]
[?25l  Downloading https://files.pythonhosted.org/packages/a8/47/7bc688d2c06c1d0fbd388b4e2725028b2792e1f652a28b848462a724c972/ray-0.8.2-cp36-cp36m-manylinux1_x86_64.whl (19.1MB)
[K     |████████████████████████████████| 19.1MB 1.2MB/s 
Collecting py-spy>=0.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/8e/a7/ab45c9ee3c4654edda3efbd6b8e2fa4962226718a7e3e3be6e3926bf3617/py_spy-0.3.3-py2.py3-none-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 46.1MB/s 
[?25hCollecting aiohttp
[?25l  Downloading https://files.pythonhosted.org/packages/7c/39/7eb5f98d24904e0f6d3edb505d4aa60e3ef83c0a58d6fe18244a51757247/aiohttp-3.6.2-cp36-

In [2]:
!wget https://gitlab.cern.ch/clcheng/mlhep-googlesummerofcode/raw/master/Prerequisite/MachineLearning/QIS_EXAM_200Events.npz

--2020-03-08 04:05:19--  https://gitlab.cern.ch/clcheng/mlhep-googlesummerofcode/raw/master/Prerequisite/MachineLearning/QIS_EXAM_200Events.npz
Resolving gitlab.cern.ch (gitlab.cern.ch)... 188.184.30.115, 188.184.84.41, 188.184.30.144, ...
Connecting to gitlab.cern.ch (gitlab.cern.ch)|188.184.30.115|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9140 (8.9K) [application/zip]
Saving to: ‘QIS_EXAM_200Events.npz’


2020-03-08 04:05:20 (249 MB/s) - ‘QIS_EXAM_200Events.npz’ saved [9140/9140]



In [0]:
import numpy as np

In [0]:
def shuffle_zippedarrays(arrays):
  """Shuffle several arrays along axis 0 using one shared random permutation.

  Row i of every input ends up at the same position in every output, so
  arrays that are aligned row-by-row (e.g. features and labels) stay aligned.

  Args:
    arrays: sequence of numpy arrays. Only the length of axis 0 has to match;
      trailing dimensions may differ between arrays.

  Returns:
    A list of new arrays, in the same order as `arrays`, each re-ordered by
    the same permutation. An empty input yields an empty list.

  Raises:
    ValueError: if the arrays do not all have the same length along axis 0.
  """
  if len(arrays) == 0:
    return []
  length = arrays[0].shape[0]
  # Compare every array against the first one; a mismatch would otherwise be
  # silently truncated (or raise an opaque IndexError) by the fancy indexing.
  if any(arr.shape[0] != length for arr in arrays):
    raise ValueError(
        'all arrays must have the same length along axis 0, got %s'
        % [arr.shape[0] for arr in arrays])
  index = np.random.permutation(length)
  return [arr[index] for arr in arrays]

def shuffle_data(input, label):
  """Shuffle an input array and its label array with the same permutation.

  Both arrays are passed to `shuffle_zippedarrays`, and the two shuffled
  results are returned as an `(input, label)` tuple.
  """
  shuffled_input, shuffled_label = shuffle_zippedarrays([input, label])
  return shuffled_input, shuffled_label

def load_data(arrays, shuffle = True):
  """Assemble per-split input and label arrays from a mapping of class-keyed data.

  For every key in `arrays`, `arrays[key].item()` is expected to return a
  mapping from a class label (anything `int()` accepts) to an array of samples
  for that class. The per-class arrays are concatenated along axis 0 and a
  label array holding the integer class of every row is built alongside.

  Args:
    arrays: mapping from split name to a 0-d object array wrapping the
      per-class mapping (for example the object returned by `np.load`).
    shuffle: when true, inputs and labels of each split are shuffled together
      via `shuffle_data`.

  Returns:
    dict mapping each split name to `{'input': ..., 'label': ...}`. A split
    without any class yields two empty arrays.
  """
  data = {}
  for key in arrays:
    # Unwrap once per split. When `arrays` is a lazily loaded .npz archive,
    # every `arrays[key]` access reads and unpickles the member again, so doing
    # this inside the class loop repeats that work for every class.
    per_class = arrays[key].item()
    features, labels = np.array([]), np.array([])
    for klabel, class_features in per_class.items():
      class_labels = np.full((class_features.shape[0],), int(klabel))
      # The first non-empty chunk replaces the empty placeholder; later chunks
      # are appended to it.
      features = np.concatenate((features, class_features), axis=0) if features.size else class_features
      labels = np.concatenate((labels, class_labels), axis=0) if labels.size else class_labels
    if shuffle:
      features, labels = shuffle_data(features, labels)
    data[key] = {'input': features, 'label': labels}
  return data
def load_train_test_input_labels(arrays, shuffle = True):
  """Load the 'training_input' and 'test_input' splits and unpack them.

  Returns a 4-tuple in the order
  `(train inputs, test inputs, train labels, test labels)`, taken from the
  dictionary produced by `load_data(arrays, shuffle)`.
  """
  splits = load_data(arrays, shuffle)
  train_split = splits['training_input']
  test_split = splits['test_input']
  return (train_split['input'], test_split['input'],
          train_split['label'], test_split['label'])

In [0]:
# allow_pickle=True is needed because load_data unwraps each archive entry
# with .item(), i.e. the entries are pickled Python objects.
# NOTE(review): unpickling can execute arbitrary code, and this file was
# downloaded over the network -- only load it from a trusted source.
data = np.load('QIS_EXAM_200Events.npz',allow_pickle=True)
# Build the train/test inputs and labels (shuffled, since shuffle defaults to True).
train_input, test_input, train_label, test_label = load_train_test_input_labels(data)

In [12]:
import lightgbm as lgb
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.schedulers import AsyncHyperBandScheduler
import ray
from hyperopt import hp
from ray import tune


def LightGBMCallback(env):
    """Forward the first entry of `env.evaluation_result_list` to Tune.

    The entry is unpacked as a 4-tuple; its second field is used as the metric
    name and its third as the value, logged through `tune.track.log`.
    """
    first_result = env.evaluation_result_list[0]
    _leading, metric_name, metric_value, _trailing = first_result
    payload = {metric_name: metric_value}
    tune.track.log(**payload)

def train_breast_cancer(params):
    """Train one LightGBM model for a Tune trial and report its accuracy.

    Despite the name, the model is trained on the notebook-level arrays
    `train_input` / `train_label` and evaluated on `test_input` / `test_label`.

    Args:
        params: sampled configuration. `params['boosting_type']` is a nested
            dict holding the actual `'boosting_type'` string and, optionally,
            a `'subsample'` value; the remaining entries are passed to
            `lgb.train` as-is (with `num_leaves` cast to int).

    Side effects:
        Logs the per-iteration validation metric through `LightGBMCallback`
        and, at the end, `mean_accuracy` together with `done=True` through
        `tune.track.log`.
    """
    # Work on a shallow copy so the caller's configuration dict is not
    # rewritten by the flattening below.
    params = dict(params)

    train_set = lgb.Dataset(train_input, label=train_label)
    test_set = lgb.Dataset(test_input, label=test_label)

    # Flatten the nested choice: pull out the subsample (defaulting to 1.0
    # when absent) and the boosting type string.
    boosting_choice = params['boosting_type']
    params['subsample'] = boosting_choice.get('subsample', 1.0)
    params['boosting_type'] = boosting_choice['boosting_type']

    # Make sure parameters that need to be integers are integers
    # (the best config in the recorded output shows num_leaves as a float).
    for parameter_name in ['num_leaves']:
        params[parameter_name] = int(params[parameter_name])

    gbm = lgb.train(
        params,
        train_set,
        valid_sets=[test_set],
        verbose_eval=False,
        callbacks=[LightGBMCallback])

    # Round the predicted scores to the nearest integer to get class labels.
    preds = gbm.predict(test_input)
    pred_labels = np.rint(preds)
    tune.track.log(
        mean_accuracy=sklearn.metrics.accuracy_score(test_label, pred_labels),
        done=True)
    
ray.shutdown()
ray.init(webui_host='127.0.0.1')

num_threads = 2

space = {
    'boosting_type': hp.choice('boosting_type', [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)}, 
                                                 {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
                                                 {'boosting_type': 'goss', 'subsample': 1.0}]),
    'num_leaves': hp.quniform('num_leaves', 10, 1000, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.000001), np.log(0.2)),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.5, 1.0),
    "objective" : "binary",
    "verbose" : -1,
    "metric" : "binary_logloss"
}

algo = HyperOptSearch(
        space,
        max_concurrent=4,
        metric="mean_accuracy",
        mode="min")
from ray.tune.schedulers import ASHAScheduler
! rm -r /lightgbm_result
analysis = tune.run(
    train_breast_cancer,
    num_samples=100,
    verbose =  0,
    resources_per_trial={'gpu': 1},
    search_alg = algo,
    local_dir = '/lightgbm_result',
    scheduler=AsyncHyperBandScheduler(metric="mean_accuracy", mode="min"))
print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

2020-03-08 05:11:42,021	INFO resource_spec.py:212 -- Starting Ray with 6.59 GiB memory available for workers and up to 3.3 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-08 05:11:42,315	INFO services.py:1078 -- View the Ray dashboard at [1m[32m127.0.0.1:8265[39m[22m
2020-03-08 05:11:44,135	INFO function_runner.py:250 -- tune.track signature detected.
2020-03-08 05:14:20,005	INFO tune.py:352 -- Returning an analysis object by default. You can call `analysis.trials` to retrieve a list of trials. This message will be removed in future versions of Tune.


Best config:  {'boosting_type': {'boosting_type': 'gbdt', 'subsample': 0.8069159416588685}, 'colsample_bytree': 0.8186783735937218, 'learning_rate': 0.04669483279255087, 'metric': 'binary_logloss', 'num_leaves': 714.0, 'objective': 'binary', 'reg_alpha': 0.6570397599979627, 'reg_lambda': 0.03784950639114426, 'verbose': -1}
