sklearn接口形式的LightGBM示例

In [1]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
 
# 加载数据
iris = load_iris()
data = iris.data
target = iris.target
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
 
# 创建模型，训练模型
gbm = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='l1', early_stopping_rounds=5)
 
# 测试机预测
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
 
# 模型评估
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
 
# feature importances
print('Feature importances:', list(gbm.feature_importances_))
 
# 网格搜索，参数优化
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {'learning_rate': [0.01, 0.1, 1],
        'n_estimators': [20, 40]}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)

[1]	valid_0's l2: 0.548976	valid_0's l1: 0.586673
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.501458	valid_0's l1: 0.570815
[3]	valid_0's l2: 0.459358	valid_0's l1: 0.55575
[4]	valid_0's l2: 0.422074	valid_0's l1: 0.541409
[5]	valid_0's l2: 0.385989	valid_0's l1: 0.52167
[6]	valid_0's l2: 0.353693	valid_0's l1: 0.50289
[7]	valid_0's l2: 0.324833	valid_0's l1: 0.485049
[8]	valid_0's l2: 0.29845	valid_0's l1: 0.470411
[9]	valid_0's l2: 0.275244	valid_0's l1: 0.454284
[10]	valid_0's l2: 0.252343	valid_0's l1: 0.43693
[11]	valid_0's l2: 0.235938	valid_0's l1: 0.426834
[12]	valid_0's l2: 0.21791	valid_0's l1: 0.41136
[13]	valid_0's l2: 0.201539	valid_0's l1: 0.396568
[14]	valid_0's l2: 0.186997	valid_0's l1: 0.382557
[15]	valid_0's l2: 0.175209	valid_0's l1: 0.370683
[16]	valid_0's l2: 0.16522	valid_0's l1: 0.361234
[17]	valid_0's l2: 0.154086	valid_0's l1: 0.349102
[18]	valid_0's l2: 0.144289	valid_0's l1: 0.337498
[19]	valid_0's l2: 0.134992	valid_0's 



原生形式使用lightgbm

In [2]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
 
iris = load_iris()
data = iris.data
target = iris.target
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
 
# 创建成lgb特征的数据集格式
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
 
# 将参数写成字典下形式
params = {'task': 'train',
      'boosting_type': 'gbdt',  # 设置提升类型
      'objective': 'regression',  # 目标函数
      'metric': {'l2', 'auc'},  # 评估函数
      'num_leaves': 31,  # 叶子节点数
      'learning_rate': 0.05,  # 学习速率
      'feature_fraction': 0.9,  # 建树的特征选择比例
      'bagging_fraction': 0.8,  # 建树的样本采样比例
      'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
      'verbose': 1  # <0 显示致命的, =0 显示错误 (警告), >0 显示信息}
 
# 训练 cv and train
gbm = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=lgb_eval, early_stopping_rounds=5)
 
# 保存模型到文件
gbm.save_model('model.txt')
 
# 预测数据集
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
 
# 评估模型
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

[1]	valid_0's auc: 1	valid_0's l2: 0.548139
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 1	valid_0's l2: 0.49587
[3]	valid_0's auc: 1	valid_0's l2: 0.451509
[4]	valid_0's auc: 1	valid_0's l2: 0.408314
[5]	valid_0's auc: 1	valid_0's l2: 0.369286
[6]	valid_0's auc: 1	valid_0's l2: 0.334835
Early stopping, best iteration is:
[1]	valid_0's auc: 1	valid_0's l2: 0.548139
The rmse of prediction is: 0.7403639345096463
