In [None]:
!pip install autokeras

In [4]:
import tensorflow as tf
import autokeras as ak
import keras_tuner as kt
import numpy as np
import matplotlib.pyplot as plt

In [5]:
import pandas as pd
import pickle
import lightgbm as lgb
import os
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

house_dataset = fetch_california_housing()
data = pd.DataFrame(house_dataset.data, columns=house_dataset.feature_names)
target = pd.Series(house_dataset.target, name="MEDV")

X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=False)

def build_model(hp):
    model = lgb.LGBMRegressor(
        boosting_type="gbdt",
        num_leaves=hp.Choice("num_leaves", [15, 31, 63], default=31),
        learning_rate=hp.Float("learning_rate", 1e-3, 10, sampling="log", default=0.05),
        n_estimators=hp.Int("n_estimators", 10, 200, step=10),)
    return model

class LightGBMTuner(kt.Tuner):
    def run_trial(self, trial, X, y, validation_data):
        model = self.hypermodel.build(trial.hyperparameters)
        model.fit(X_train, y_train,
                  eval_set=[validation_data],
                  eval_metric="mse")
        X_val, y_val = validation_data
        y_pred = model.predict(
            X_val, num_iteration=model.best_iteration_
        )  # evaluate the model
        eval_mse = mean_squared_error(y_val, y_pred)
        self.save_model(trial.trial_id, model)
        return {"mse": eval_mse}

    def save_model(self, trial_id, model, step=0):
        fname = os.path.join(self.get_trial_dir(trial_id), "model.txt")
        model.booster_.save_model(fname, num_iteration=model.best_iteration_)

    def load_model(self, trial):
        fname = os.path.join(self.get_trial_dir(trial.trial_id), "model.txt")
        model = lgb.Booster(model_file=fname)
        return model

```
BayesianOptimization을 사용자 정의 클래스로 구현할 수 있지만
라이브러리에 있는 알고리즘과 같아서 생략함.
만약 중간에 학습이 정지되었다면 project name을 기존과 같게 설정(유지)하고
overwrite = False로 지정하면 됨.
```

In [6]:
my_lightgbm_tuner = LightGBMTuner(
    oracle=kt.oracles.BayesianOptimizationOracle(
        objective=kt.Objective("mse", "min"), max_trials=30, seed=42),
    hypermodel=build_model,
    overwrite=True,
    project_name="my_lightgbm_tuner")

my_lightgbm_tuner.search(X_train, y_train, validation_data=(X_val, y_val))

Trial 30 Complete [00h 00m 00s]
mse: 184163391782.9315

Best mse So Far: 0.21736801528572022
Total elapsed time: 00h 00m 35s


In [7]:
from sklearn.metrics import mean_squared_error

best_model = my_lightgbm_tuner.get_best_models(1)[0]
y_pred_test = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
print("The prediction MSE on test set: {}".format(test_mse))

The prediction MSE on test set: 0.20677321538492008


```
더 나아가서.. 진화적 기법을 사용해본다.   
연속적인 정수 feature가 많은 경우에는 베이지안이 더 낫지만   
범주형 feature가 더 많다면 진화적 기법을 고려해볼만 하다.   
추가적으로 진화적 기법에서 돌연변이는 학습률같은 하이퍼파라미터를 탐색할떄
좋은 선택이라고 할 수 있는데 이를 개선하기 위해서 2가지 방법을 사용해볼 수 있다.
1. 로그 스케일 적용
2. 돌연변이 가이드에 대리 모델을 추가해 진화적 기법과 모델 기반 기법을 결합
   -> 돌연변이를 여러 번 임의로 수행하고 대리 모델로 가장 좋은 시도를 자손으로
      선택하는 기법
```

In [11]:
import os
import pickle
import tensorflow as tf
import keras_tuner as kt
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


class LightGBMTuner(kt.engine.base_tuner.BaseTuner):
    def run_trial(self, trial, X, y, validation_data):
        model = self.hypermodel.build(trial.hyperparameters)  # build the model
        model.fit(
            X_train,
            y_train,
            eval_set=[validation_data],
            eval_metric="mse",
            #early_stopping_rounds=5,
        )  # fit the model
        X_val, y_val = validation_data
        y_pred = model.predict(
            X_val, num_iteration=model.best_iteration_
        )  # evaluate the model
        eval_mse = mean_squared_error(y_val, y_pred)
        self.save_model(trial.trial_id, model)  # save the model to disk
        # inform the oracle of the eval result, the result is a dictionary with the metric names as the keys.
        return {"mse": eval_mse}

    def save_model(self, trial_id, model, step=0):
        fname = os.path.join(self.get_trial_dir(trial_id), "model.txt")
        model.booster_.save_model(fname, num_iteration=model.best_iteration_)

    def load_model(self, trial):
        fname = os.path.join(self.get_trial_dir(trial.trial_id), "model.txt")
        model = lgb.Booster(model_file=fname)
        return model

In [9]:
import random
import numpy as np
from keras_tuner.engine import hyperparameters as hp_module
from keras_tuner.engine import oracle as oracle_module
from keras_tuner.engine import trial as trial_lib


class EvolutionaryOracle(oracle_module.Oracle):
    """Evolutionary search oracle.

        It uses aging evluation algorithm following: https://arxiv.org/pdf/1802.01548.pdf.
        # Arguments
            objective: String or `kerastuner.Objective`. If a string,
              the direction of the optimization (min or max) will be
              inferred.
            max_trials: Int. Total number of trials
                (model configurations) to test at most.
                Note that the oracle may interrupt the search
                before `max_trial` models have been tested if the search space has been
                exhausted.
            num_initial_points: (Optional) Int. The number of randomly generated samples
                as initial training data for Evolutionary search. If not specified,
                a value of 3 times the dimensionality of the hyperparameter space is
                used.
            population_size: (Optional) Int. The number of trials to form the populations.
    candidate_size: (Optional) Int. The number of candidate trials in the tournament
    selection.
            seed: Int. Random seed.
            hyperparameters: HyperParameters class instance.
                Can be used to override (or register in advance)
                hyperparamters in the search space.
    """

    def __init__(
        self,
        objective,
        max_trials,
        num_initial_points=None,
        population_size=None,
        candidate_size=None,
        seed=None,
        hyperparameters=None,
        *args,
        **kwargs
    ):
        super(EvolutionaryOracle, self).__init__(
            objective=objective,
            max_trials=max_trials,
            hyperparameters=hyperparameters,
            seed=seed,
            *args,
            **kwargs
        )
        self.population_size = population_size or 20
        self.candidate_size = candidate_size or 5
        self.num_initial_points = num_initial_points or self.population_size
        self.num_initial_points = max(self.num_initial_points, population_size)
        self.population_trial_ids = []
        self.seed = seed or random.randint(1, 1e4)
        self._seed_state = self.seed
        self._random_state = np.random.RandomState(self.seed)
        self._max_collisions = 100

    def _random_populate_space(self):
        values = self._random_values()
        if values is None:
            return {"status": trial_lib.TrialStatus.STOPPED, "values": None}
        return {"status": trial_lib.TrialStatus.RUNNING, "values": values}

    def _num_completed_trials(self):
        return len([t for t in self.trials.values() if t.status == "COMPLETED"])

    def populate_space(self, trial_id):

        if self._num_completed_trials() < self.num_initial_points:
            return self._random_populate_space()

        self.population_trial_ids = self.end_order[-self.population_size :]

        # candidate trial selection
        candidate_indices = self._random_state.choice(
            self.population_size, self.candidate_size, replace=False
        )
        self.candidate_indices = candidate_indices
        candidate_trial_ids = list(
            map(self.population_trial_ids.__getitem__, candidate_indices)
        )

        # get the best candidate based on the performance
        candidate_scores = [
            self.trials[trial_id].score for trial_id in candidate_trial_ids
        ]
        best_candidate_trial_id = candidate_trial_ids[np.argmin(candidate_scores)]
        best_candidate_trial = self.trials[best_candidate_trial_id]

        # mutate the hps of the candidate
        values = self._mutate(best_candidate_trial)

        if values is None:
            return {"status": trial_lib.TrialStatus.STOPPED, "values": None}

        return {"status": trial_lib.TrialStatus.RUNNING, "values": values}

    def _mutate(self, best_trial):

        best_hps = best_trial.hyperparameters

        # get non-fixed and active hyperparameters in the trial to be mutated
        nonfixed_active_hps = [
            hp
            for hp in self.hyperparameters.space
            if not isinstance(hp, hp_module.Fixed) and best_hps.is_active(hp)
        ]

        # random select a hyperparameter to mutate
        hp_to_mutate = self._random_state.choice(nonfixed_active_hps, 1)[0]

        collisions = 0
        while True:
            hps = hp_module.HyperParameters()
            # Generate a set of random values.
            for hp in self.hyperparameters.space:
                hps.merge([hp])
                # if not active, do nothing.
                # if active, check if selected to be changed.
                if hps.is_active(hp):
                    # if was active and not selected, do nothing.
                    if best_hps.is_active(hp.name) and hp.name != hp_to_mutate.name:
                        hps.values[hp.name] = best_hps.values[hp.name]
                        continue
                    # if was not active or selected, sample.
                    hps.values[hp.name] = hp.random_sample(self._seed_state)
                    self._seed_state += 1
            values = hps.values

            # Make sure the new hyperparameters has not been evaluated before
            # Keep trying until the set of values is unique,
            # or until we exit due to too many collisions.
            values_hash = self._compute_values_hash(values)
            if values_hash in self._tried_so_far:
                collisions += 1
                if collisions <= self._max_collisions:
                    continue
                return None
            self._tried_so_far.add(values_hash)
            break
        return values

    def get_state(self):
        state = super(EvolutionaryOracle, self).get_state()
        state.update(
            {
                "num_initial_points": self.num_initial_points,
                "population_size": self.population_size,
                "candidate_size": self.candidate_size,
                "seed": self.seed,
                "_max_collisions": self._max_collisions,
            }
        )
        return state

    def set_state(self, state):
        super(EvolutionaryOracle, self).set_state(state)
        self.num_initial_points = state["num_initial_points"]
        self.population_size = state["population_size"]
        self.candidate_size = state["candidate_size"]
        self.population_trial_ids = self.end_order[-self.population_size :]
        self.seed = state["seed"]
        self._random_state = np.random.RandomState(self.seed)
        self._seed_state = self.seed
        self._max_collisions = state["max_collisions"]

In [12]:
evo_tuner_p20c5 = LightGBMTuner(
    oracle=EvolutionaryOracle(
        objective=kt.Objective("mse", "min"),
        max_trials=100,
        population_size=20,
        candidate_size=5,
        seed=42,
    ),
    hypermodel=build_model,
    overwrite=True,
    project_name="evo_tuner_p20c5",
)

evo_tuner_p20c5.search(X_train, y_train, validation_data=(X_val, y_val))

Trial 45 Complete [00h 00m 01s]
mse: 1.0202325236736902

Best mse So Far: 0.21464431132198714
Total elapsed time: 00h 00m 18s


In [13]:
from sklearn.metrics import mean_squared_error

best_model = evo_tuner_p20c5.get_best_models(1)[0]
y_pred_test = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
print("The prediction MSE on test set: {}".format(test_mse))

The prediction MSE on test set: 0.20921098098501498
