In [12]:
import numpy as np
import pandas as pd
import time
import os 

#算法/损失/评估指标等
import sklearn
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import KFold, cross_validate

#优化器
from bayes_opt import BayesianOptimization

import hyperopt
from hyperopt import hp, fmin, tpe, Trials, partial
from hyperopt.early_stop import no_progress_loss

import optuna

In [13]:
# Load the already-encoded House Price training table.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted
# before the notebook can run on another machine.
data = pd.read_csv(
    "E:/机器学习_冲刺班/Lesson 09.随机森林模型/datasets/House Price/train_encode.csv"
)

# Every column but the last is used as a feature; the last column is the target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [14]:
def bayesopt_objective(n_estimators, max_depth, max_features, min_impurity_decrease):
    """Objective for bayes_opt: mean 5-fold CV negative RMSE of a random forest.

    The tuned hyperparameters arrive as the function arguments; everything else
    is fixed here. The optimizer supplies floats, so the integer-valued
    hyperparameters are truncated with ``int`` before reaching the estimator.
    The data ``X`` and ``y`` are read from the notebook's global scope because
    they must not be inputs of the objective.

    Returns
    -------
    The mean of the per-fold ``neg_root_mean_squared_error`` scores. The score
    is already a negated loss, so it can be maximized directly.
    """
    forest = RFR(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        max_features=int(max_features),
        min_impurity_decrease=min_impurity_decrease,
        random_state=1412,
        verbose=False,  # switch on to print tree-building progress
        n_jobs=-1,
    )

    # Fixed, shuffled 5-fold split so every candidate is scored on the same folds.
    folds = KFold(n_splits=5, shuffle=True, random_state=1412)
    cv_results = cross_validate(
        forest,
        X,
        y,
        scoring="neg_root_mean_squared_error",
        cv=folds,
        verbose=False,
        n_jobs=-1,
        error_score='raise',  # if a fit fails, raise and show the reason
    )

    fold_scores = cv_results["test_score"]
    return np.mean(fold_scores)

In [15]:
# Search bounds handed to bayes_opt: each hyperparameter maps to a
# (lower, upper) pair.
param_grid_simple = dict(
    n_estimators=(80, 100),
    max_depth=(10, 25),
    max_features=(10, 20),
    min_impurity_decrease=(0, 1),
)

In [16]:
def param_bayes_opt(init_points, n_iter):
    """Maximize ``bayesopt_objective`` over ``param_grid_simple`` with bayes_opt.

    Parameters
    ----------
    init_points : number of initial observations to draw.
    n_iter : number of optimization iterations requested from the optimizer.

    Returns
    -------
    (params_best, score_best): the ``"params"`` and ``"target"`` entries of the
    optimizer's best record. Both are also printed.
    """
    # The objective and the bounds both come from the notebook's global scope.
    optimizer = BayesianOptimization(
        bayesopt_objective,  # function to optimize
        param_grid_simple,   # candidate parameter space
        random_state=1412,   # seed (original author notes it does not fully pin the run)
    )

    # bayes_opt only supports maximization.
    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    # Pull the best record once and unpack it.
    best_record = optimizer.max
    params_best, score_best = best_record["params"], best_record["target"]

    print("\n","\n","best params: ", params_best,
          "\n","\n","best cvscore: ", score_best)

    return params_best, score_best

In [17]:
def bayes_opt_validation(params_best):
    """Re-score a bayes_opt parameter dict with the same 5-fold CV setup.

    ``params_best`` must provide ``n_estimators``, ``max_depth``,
    ``max_features`` and ``min_impurity_decrease``; the first three are
    truncated to ``int``. Returns the mean negative RMSE across folds.
    Unlike the objective, ``error_score`` is left at its default here.
    """
    integer_params = {
        name: int(params_best[name])
        for name in ("n_estimators", "max_depth", "max_features")
    }
    model = RFR(
        **integer_params,
        min_impurity_decrease=params_best["min_impurity_decrease"],
        random_state=1412,
        verbose=False,
        n_jobs=-1,
    )

    folds = KFold(n_splits=5, shuffle=True, random_state=1412)
    cv_results = cross_validate(
        model,
        X,
        y,
        scoring="neg_root_mean_squared_error",
        cv=folds,
        verbose=False,
        n_jobs=-1,
    )
    return np.mean(cv_results["test_score"])

In [18]:
# Time the full bayes_opt search, then re-score the winning parameters.
start = time.time()
params_best, score_best = param_bayes_opt(20,280) #start from 20 initial observations, then iterate 280 more times
# Elapsed wall-clock time, converted from seconds to minutes.
print('It takes %s minutes' % ((time.time() - start)/60))
# Re-run cross-validation with the best parameters found by the search.
validation_score = bayes_opt_validation(params_best)
print("\n","\n","validation_score: ",validation_score)

|   iter    |  target   | max_depth | max_fe... | min_im... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-2.934e+0[0m | [0m23.2     [0m | [0m17.52    [0m | [0m0.06379  [0m | [0m88.79    [0m |
| [95m2        [0m | [95m-2.933e+0[0m | [95m14.8     [0m | [95m17.61    [0m | [95m0.9214   [0m | [95m97.58    [0m |
| [0m3        [0m | [0m-2.94e+04[0m | [0m15.86    [0m | [0m15.56    [0m | [0m0.2661   [0m | [0m87.98    [0m |
| [95m4        [0m | [95m-2.895e+0[0m | [95m14.05    [0m | [95m16.84    [0m | [95m0.06744  [0m | [95m89.72    [0m |
| [0m5        [0m | [0m-2.982e+0[0m | [0m18.71    [0m | [0m19.17    [0m | [0m0.9315   [0m | [0m83.7     [0m |
| [0m6        [0m | [0m-2.938e+0[0m | [0m17.7     [0m | [0m19.58    [0m | [0m0.7127   [0m | [0m89.18    [0m |
| [0m7        [0m | [0m-2.925e+0[0m | [0m14.21    [0m | [0m12.62    [0m | [0m0.3381   [0m | [0m91

| [0m67       [0m | [0m-2.891e+0[0m | [0m14.55    [0m | [0m16.73    [0m | [0m0.3588   [0m | [0m88.84    [0m |
| [0m68       [0m | [0m-2.942e+0[0m | [0m14.54    [0m | [0m17.6     [0m | [0m0.06964  [0m | [0m87.39    [0m |
| [0m69       [0m | [0m-2.891e+0[0m | [0m14.74    [0m | [0m16.2     [0m | [0m0.9127   [0m | [0m88.97    [0m |
| [0m70       [0m | [0m-2.891e+0[0m | [0m14.08    [0m | [0m16.45    [0m | [0m0.7028   [0m | [0m88.92    [0m |
| [0m71       [0m | [0m-2.934e+0[0m | [0m19.28    [0m | [0m17.77    [0m | [0m0.1678   [0m | [0m92.86    [0m |
| [0m72       [0m | [0m-2.891e+0[0m | [0m14.92    [0m | [0m16.61    [0m | [0m0.6595   [0m | [0m88.53    [0m |
| [0m73       [0m | [0m-2.905e+0[0m | [0m18.17    [0m | [0m17.02    [0m | [0m0.0      [0m | [0m95.05    [0m |
| [0m74       [0m | [0m-2.907e+0[0m | [0m17.9     [0m | [0m17.7     [0m | [0m0.4353   [0m | [0m97.62    [0m |
| [0m75       [0m | [

| [0m135      [0m | [0m-2.869e+0[0m | [0m10.25    [0m | [0m16.57    [0m | [0m0.1563   [0m | [0m89.93    [0m |
| [0m136      [0m | [0m-2.969e+0[0m | [0m20.93    [0m | [0m19.89    [0m | [0m0.0004996[0m | [0m91.1     [0m |
| [0m137      [0m | [0m-2.869e+0[0m | [0m10.29    [0m | [0m16.49    [0m | [0m0.3564   [0m | [0m89.61    [0m |
| [0m138      [0m | [0m-2.869e+0[0m | [0m10.03    [0m | [0m16.55    [0m | [0m0.5935   [0m | [0m89.94    [0m |
| [0m139      [0m | [0m-2.869e+0[0m | [0m10.14    [0m | [0m16.03    [0m | [0m0.2995   [0m | [0m89.78    [0m |
| [0m140      [0m | [0m-2.951e+0[0m | [0m22.91    [0m | [0m19.65    [0m | [0m0.8711   [0m | [0m97.25    [0m |
| [0m141      [0m | [0m-2.879e+0[0m | [0m11.0     [0m | [0m16.27    [0m | [0m0.5273   [0m | [0m86.79    [0m |
| [0m142      [0m | [0m-2.921e+0[0m | [0m17.9     [0m | [0m16.99    [0m | [0m0.8974   [0m | [0m83.97    [0m |
| [0m143      [0m | [

| [95m203      [0m | [95m-2.868e+0[0m | [95m10.0     [0m | [95m16.4     [0m | [95m0.4624   [0m | [95m91.07    [0m |
| [0m204      [0m | [0m-2.874e+0[0m | [0m10.16    [0m | [0m16.43    [0m | [0m0.5244   [0m | [0m87.26    [0m |
| [0m205      [0m | [0m-2.945e+0[0m | [0m11.76    [0m | [0m12.2     [0m | [0m0.7708   [0m | [0m85.51    [0m |
| [0m206      [0m | [0m-2.87e+04[0m | [0m10.4     [0m | [0m16.59    [0m | [0m0.4192   [0m | [0m90.86    [0m |
| [0m207      [0m | [0m-2.868e+0[0m | [0m10.17    [0m | [0m16.5     [0m | [0m0.0      [0m | [0m91.25    [0m |
| [0m208      [0m | [0m-2.869e+0[0m | [0m10.83    [0m | [0m16.42    [0m | [0m0.6176   [0m | [0m89.39    [0m |
| [0m209      [0m | [0m-2.953e+0[0m | [0m23.49    [0m | [0m15.7     [0m | [0m0.04256  [0m | [0m81.9     [0m |
| [0m210      [0m | [0m-2.874e+0[0m | [0m10.11    [0m | [0m16.54    [0m | [0m0.1168   [0m | [0m87.72    [0m |
| [0m211      [0

| [0m271      [0m | [0m-2.865e+0[0m | [0m10.24    [0m | [0m16.58    [0m | [0m0.044    [0m | [0m92.55    [0m |
| [0m272      [0m | [0m-2.868e+0[0m | [0m10.6     [0m | [0m16.46    [0m | [0m0.7428   [0m | [0m94.61    [0m |
| [0m273      [0m | [0m-2.938e+0[0m | [0m24.74    [0m | [0m18.81    [0m | [0m0.06543  [0m | [0m85.4     [0m |
| [0m274      [0m | [0m-2.965e+0[0m | [0m10.38    [0m | [0m15.83    [0m | [0m0.9038   [0m | [0m94.71    [0m |
| [0m275      [0m | [0m-2.957e+0[0m | [0m22.47    [0m | [0m19.82    [0m | [0m0.6159   [0m | [0m86.79    [0m |
| [0m276      [0m | [0m-2.937e+0[0m | [0m11.06    [0m | [0m16.66    [0m | [0m0.4028   [0m | [0m95.09    [0m |
| [0m277      [0m | [0m-2.946e+0[0m | [0m13.4     [0m | [0m16.5     [0m | [0m0.7104   [0m | [0m80.12    [0m |
| [0m278      [0m | [0m-2.939e+0[0m | [0m15.77    [0m | [0m12.88    [0m | [0m0.7749   [0m | [0m90.44    [0m |
| [0m279      [0m | [

In [21]:
import hyperopt
from hyperopt import hp, fmin, tpe, Trials, partial
from hyperopt.early_stop import no_progress_loss

In [22]:
def hyperopt_objective(params):
    """Objective for hyperopt: mean 5-fold CV RMSE of a random forest.

    ``params`` is the sampled point, a mapping with ``n_estimators``,
    ``max_depth``, ``max_features`` and ``min_impurity_decrease``. The first
    three are truncated to ``int``. The data ``X`` and ``y`` come from the
    notebook's global scope.

    Returns the mean absolute value of the per-fold negative RMSE scores,
    i.e. a positive loss.
    """
    integer_params = {
        name: int(params[name])
        for name in ("n_estimators", "max_depth", "max_features")
    }
    model = RFR(
        **integer_params,
        min_impurity_decrease=params["min_impurity_decrease"],
        random_state=1412,
        verbose=False,
        n_jobs=-1,
    )

    folds = KFold(n_splits=5, shuffle=True, random_state=1412)
    cv_results = cross_validate(
        model,
        X,
        y,
        scoring="neg_root_mean_squared_error",
        cv=folds,
        verbose=False,
        n_jobs=-1,
        error_score='raise',  # raise on a failed fit instead of recording NaN
    )

    # Flip the sign of the negative RMSE scores before averaging.
    fold_losses = abs(cv_results["test_score"])
    return np.mean(fold_losses)

In [23]:
# hyperopt search space: every hyperparameter is drawn with hp.quniform
# (label, low, high, q) using a quantization step of 1.
# NOTE: this rebinds `param_grid_simple`, the same name an earlier cell uses
# for the bayes_opt bounds read by param_bayes_opt. After this cell runs, the
# name refers to the hyperopt space instead.
param_grid_simple = {
    "n_estimators": hp.quniform("n_estimators", 80, 100, 1),
    "max_depth": hp.quniform("max_depth", 10, 25, 1),
    "max_features": hp.quniform("max_features", 10, 20, 1),
    "min_impurity_decrease": hp.quniform("min_impurity_decrease", 0, 5, 1),
}

In [24]:
def param_hyperopt(max_evals=100, early_stop_rounds=100):
    """Minimize ``hyperopt_objective`` over ``param_grid_simple`` with TPE.

    Parameters
    ----------
    max_evals : int, default 100
        Maximum number of evaluations allowed.
    early_stop_rounds : int, default 100
        Patience passed to ``no_progress_loss``: the search stops once this
        many consecutive evaluations bring no improvement of the best loss.
        The default keeps the original hard-coded value. With the default
        ``max_evals`` of 100 the stop cannot trigger before the budget is
        exhausted, so lower this value (or raise ``max_evals``) to make early
        stopping effective.

    Returns
    -------
    (params_best, trials): the best point returned by ``fmin`` and the
    ``Trials`` object holding the full search history. The best point is also
    printed.
    """
    # Records every evaluation so the search history can be inspected later.
    trials = Trials()

    # Early-stopping rule based on lack of progress in the best loss.
    early_stop_fn = no_progress_loss(early_stop_rounds)

    # The objective and the search space are read from the notebook's globals.
    params_best = fmin(
        hyperopt_objective,         # objective to minimize
        space=param_grid_simple,    # search space
        algo=tpe.suggest,           # surrogate / suggestion algorithm
        max_evals=max_evals,        # evaluation budget
        verbose=True,
        trials=trials,
        early_stop_fn=early_stop_fn,
    )

    print("\n","\n","best params: ", params_best,
          "\n")
    return params_best, trials

In [25]:
def hyperopt_validation(params):
    """Re-score a hyperopt parameter dict with the same 5-fold CV setup.

    ``params`` must provide ``n_estimators``, ``max_depth``, ``max_features``
    and ``min_impurity_decrease``; the first three are truncated to ``int``.
    Returns the mean RMSE across folds as a positive number. Unlike the
    objective, ``error_score`` is left at its default here.
    """
    integer_params = {
        name: int(params[name])
        for name in ("n_estimators", "max_depth", "max_features")
    }
    model = RFR(
        **integer_params,
        min_impurity_decrease=params["min_impurity_decrease"],
        random_state=1412,
        verbose=False,
        n_jobs=-1,
    )

    folds = KFold(n_splits=5, shuffle=True, random_state=1412)
    cv_results = cross_validate(
        model,
        X,
        y,
        scoring="neg_root_mean_squared_error",
        cv=folds,
        verbose=False,
        n_jobs=-1,
    )

    # Scores are negative RMSE; take absolute values to report a positive loss.
    fold_losses = abs(cv_results["test_score"])
    return np.mean(fold_losses)

In [26]:
# hyperopt search with a budget of 100 evaluations.
# Rebinds `params_best`, which the bayes_opt run cell also assigns.
params_best, trials = param_hyperopt(100)

100%|██████████| 100/100 [00:55<00:00,  1.81trial/s, best loss: 28649.507326468643]

 
 best params:  {'max_depth': 10.0, 'max_features': 16.0, 'min_impurity_decrease': 2.0, 'n_estimators': 92.0} 



In [27]:
# Same search with a budget of 300 evaluations; overwrites the results of the
# 100-evaluation run stored in `params_best` and `trials`.
params_best, trials = param_hyperopt(300) 

 75%|███████▌  | 226/300 [02:13<00:43,  1.70trial/s, best loss: 28649.507958829832]

 
 best params:  {'max_depth': 10.0, 'max_features': 16.0, 'min_impurity_decrease': 3.0, 'n_estimators': 92.0} 



In [28]:
# Re-score the best hyperopt parameters; the returned mean RMSE is shown as the cell output.
hyperopt_validation(params_best)

28649.507958829832

In [29]:
# Inspect the stored record of the first evaluation in the search history.
trials.trials[0]

{'state': 2,
 'tid': 0,
 'spec': None,
 'result': {'loss': 29484.585160647344, 'status': 'ok'},
 'misc': {'tid': 0,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'max_depth': [0],
   'max_features': [0],
   'min_impurity_decrease': [0],
   'n_estimators': [0]},
  'vals': {'max_depth': [24.0],
   'max_features': [15.0],
   'min_impurity_decrease': [4.0],
   'n_estimators': [82.0]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2023, 3, 5, 1, 39, 6, 232000),
 'refresh_time': datetime.datetime(2023, 3, 5, 1, 39, 10, 936000)}

In [30]:
import optuna

In [31]:
def optuna_objective(trial):
    """Objective for Optuna: mean 5-fold CV RMSE of a random forest.

    The search space is declared inline through ``trial.suggest_int`` calls,
    so no separate parameter grid is needed. The data ``X`` and ``y`` come
    from the notebook's global scope.

    Parameters
    ----------
    trial : the trial object Optuna passes to the objective.

    Returns
    -------
    The mean absolute value of the per-fold negative RMSE scores, i.e. a
    positive RMSE. The study should therefore use the ``minimize`` direction
    (returning the negative RMSE would instead call for ``maximize``).
    """
    # Search space: integer parameters given as (name, low, high) plus a step.
    # `step` is passed by keyword: recent Optuna releases make it keyword-only
    # and warn when it is supplied positionally.
    n_estimators = trial.suggest_int("n_estimators", 80, 100, step=1)
    max_depth = trial.suggest_int("max_depth", 10, 25, step=1)
    max_features = trial.suggest_int("max_features", 10, 20, step=1)
    # min_impurity_decrease is searched on an integer grid here; a string-valued
    # parameter would use trial.suggest_categorical and a continuous one
    # trial.suggest_float instead.
    min_impurity_decrease = trial.suggest_int("min_impurity_decrease", 0, 5, step=1)

    # Tuned hyperparameters come from the suggestions above; the rest are fixed.
    reg = RFR(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        min_impurity_decrease=min_impurity_decrease,
        random_state=1412,
        verbose=False,
        n_jobs=-1,
    )

    # Fixed, shuffled 5-fold split so every trial is scored on the same folds.
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    validation_loss = cross_validate(
        reg,
        X,
        y,
        scoring="neg_root_mean_squared_error",
        cv=cv,                # cross-validation scheme
        verbose=False,        # do not print progress
        n_jobs=-1,            # parallel jobs
        error_score='raise',  # raise on a failed fit instead of recording NaN
    )

    # Report the positive RMSE.
    return np.mean(abs(validation_loss["test_score"]))

In [32]:
def optimizer_optuna(n_trials, algo):
    """Minimize ``optuna_objective`` with the chosen sampler and report the best trial.

    Parameters
    ----------
    n_trials : int
        Maximum number of trials, including the initial observations.
    algo : str or sampler object
        ``"TPE"`` selects a TPE sampler and ``"GP"`` a Gaussian-process sampler
        backed by scikit-optimize. Any non-string value is forwarded unchanged
        to ``optuna.create_study`` as the sampler.

    Returns
    -------
    (params, values): ``study.best_trial.params`` and
    ``study.best_trial.values``. Both are also printed.

    Raises
    ------
    ValueError
        If ``algo`` is a string other than ``"TPE"`` or ``"GP"``. Previously
        such a string was passed on as the sampler itself and only failed
        later with an unrelated-looking error.
    """
    # Resolve the sampler from its name.
    if algo == "TPE":
        sampler = optuna.samplers.TPESampler(n_startup_trials=10, n_ei_candidates=24)
    elif algo == "GP":
        from optuna.integration import SkoptSampler
        # Not used directly below; kept from the original. Presumably imported
        # so a missing scikit-optimize install fails right here - TODO confirm.
        import skopt
        sampler = SkoptSampler(
            skopt_kwargs={
                'base_estimator': 'GP',    # Gaussian process surrogate
                'n_initial_points': 10,    # 10 initial observations
                'acq_func': 'EI',          # acquisition function: expected improvement
            }
        )
    elif isinstance(algo, str):
        raise ValueError(
            "Unknown algo %r: expected 'TPE' or 'GP'" % (algo,)
        )
    else:
        # Non-string values keep the original pass-through behavior.
        sampler = algo

    # Instantiate the study; the objective returns RMSE, so minimize it.
    study = optuna.create_study(
        sampler=sampler,
        direction="minimize",
    )

    # The search space is defined inside the objective, so only the objective
    # and the trial budget are needed here.
    study.optimize(
        optuna_objective,
        n_trials=n_trials,
        show_progress_bar=True,
    )

    # Results are read straight from the finished study.
    print("\n","\n","best params: ", study.best_trial.params,
          "\n","\n","best score: ", study.best_trial.values,
          "\n")

    return study.best_trial.params, study.best_trial.values

In [33]:
import warnings
# Suppress warnings whose text starts with this message (the filter matches the
# beginning of the warning message). Presumably emitted by the GP sampler when
# it revisits an already-evaluated point - TODO confirm the source.
warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before.')

In [34]:
# Short trial run: 10 trials with the Gaussian-process sampler.
best_params, best_score = optimizer_optuna(10,"GP")

[32m[I 2023-03-04 20:41:48,927][0m A new study created in memory with name: no-name-d2c70cd7-5b5e-4d2a-b10c-c00c2f3272b8[0m
  self._init_valid()


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2023-03-04 20:41:49,335][0m Trial 0 finished with value: 29236.318496308017 and parameters: {'n_estimators': 80, 'max_depth': 20, 'max_features': 11, 'min_impurity_decrease': 4}. Best is trial 0 with value: 29236.318496308017.[0m
[32m[I 2023-03-04 20:41:49,805][0m Trial 1 finished with value: 29442.97729453431 and parameters: {'n_estimators': 82, 'max_depth': 22, 'max_features': 17, 'min_impurity_decrease': 2}. Best is trial 0 with value: 29236.318496308017.[0m
[32m[I 2023-03-04 20:41:50,325][0m Trial 2 finished with value: 29166.29792534038 and parameters: {'n_estimators': 97, 'max_depth': 14, 'max_features': 12, 'min_impurity_decrease': 0}. Best is trial 2 with value: 29166.29792534038.[0m
[32m[I 2023-03-04 20:41:50,755][0m Trial 3 finished with value: 29431.393775417244 and parameters: {'n_estimators': 93, 'max_depth': 11, 'max_features': 10, 'min_impurity_decrease': 0}. Best is trial 2 with value: 29166.29792534038.[0m
[32m[I 2023-03-04 20:41:51,323][0m Trial 4

In [35]:
# 300 trials with the TPE sampler, with Optuna's per-trial log lines silenced.
optuna.logging.set_verbosity(optuna.logging.ERROR) #turn off the automatically printed info; only the progress bar is shown
#optuna.logging.set_verbosity(optuna.logging.INFO)
best_params, best_score = optimizer_optuna(300,"TPE")

  0%|          | 0/300 [00:00<?, ?it/s]


 
 best params:  {'n_estimators': 85, 'max_depth': 10, 'max_features': 16, 'min_impurity_decrease': 1} 
 
 best score:  [28790.39833062145] 



In [36]:
# 300 trials with the Gaussian-process sampler; overwrites the TPE results
# stored in `best_params` and `best_score`.
optuna.logging.set_verbosity(optuna.logging.ERROR)
best_params, best_score = optimizer_optuna(300,"GP")

  0%|          | 0/300 [00:00<?, ?it/s]


 
 best params:  {'n_estimators': 100, 'max_depth': 10, 'max_features': 16, 'min_impurity_decrease': 0} 
 
 best score:  [28692.908874419536] 

