In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.grid_search import GridSearchCV  # Perforing grid search
from sklearn.model_selection import train_test_split



In [2]:
print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('./data/regression.train', header=None, sep='\t')
df_test = pd.read_csv('./data/regression.test', header=None, sep='\t')

Loading data...


In [3]:
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

### 划分训练集和验证集

In [4]:
train_x, valid_x, train_y, valid_y = train_test_split(X_train, y_train, test_size=0.2, random_state= 2019)   # 分训练集和验证集

### Dataset类型数据

In [5]:
train = lgb.Dataset(train_x, train_y)
valid = lgb.Dataset(valid_x, valid_y, reference=train)

### LightGBM调参过程

1. n_estimators、lr
2. max_depth和num_leaves
3. min_data_in_leaf和max_bin in
4. feature_fraction、bagging_fraction、bagging_freq
5. lambda_l1和lambda_l2
6.  min_split_gain 
7. 降低学习率，增加迭代次数，验证模型，防止过拟合

### LightGBM 可以调节的参数

In [6]:
"""from lightgbm.sklearn import LGBMRegressor
lgb_model = LGBMRegressor( 
    boosting_type='gbdt', 
    num_leaves=30, 
    max_depth=79, 
    learning_rate=0.1, 
    n_estimators=100,               #  1 
    max_bin=255, 
    subsample_for_bin=50000, 
    objective='regression', 
    min_split_gain=0, 
    min_child_weight=5, 
    min_child_samples=10, 
    subsample=1, 
    subsample_freq=1, 
    colsample_bytree=1, 
    reg_alpha=0, 
    reg_lambda=0, 
    seed=0, 
    nthread=-1, 
    silent=True, 
    huber_delta=1.0, 
    gaussian_eta=1.0, 
    fair_c=1.0, 
    poisson_max_delta_step=0.7, 
    drop_rate=0.1, 
    skip_drop=0.5, 
    max_drop=50, 
    uniform_drop=False, 
    xgboost_dart_mode=False,
    verbose_eval=100
)"""

"from lightgbm.sklearn import LGBMRegressor\nlgb_model = LGBMRegressor( \n    boosting_type='gbdt', \n    num_leaves=30, \n    max_depth=79, \n    learning_rate=0.1, \n    n_estimators=100,               #  1 \n    max_bin=255, \n    subsample_for_bin=50000, \n    objective='regression', \n    min_split_gain=0, \n    min_child_weight=5, \n    min_child_samples=10, \n    subsample=1, \n    subsample_freq=1, \n    colsample_bytree=1, \n    reg_alpha=0, \n    reg_lambda=0, \n    seed=0, \n    nthread=-1, \n    silent=True, \n    huber_delta=1.0, \n    gaussian_eta=1.0, \n    fair_c=1.0, \n    poisson_max_delta_step=0.7, \n    drop_rate=0.1, \n    skip_drop=0.5, \n    max_drop=50, \n    uniform_drop=False, \n    xgboost_dart_mode=False,\n    verbose_eval=100\n)"

In [7]:
{'bagging_fraction': 0.7,
 'bagging_freq': 30,
 'feature_fraction': 0.8,
 'lambda_l1': 0.1,
 'lambda_l2': 0.0,
 'max_bin': 255,
 'max_depth': 4,
 'min_data_in_leaf': 81,
 'min_split_gain': 0.1,
 'num_leaves': 10}

{'bagging_fraction': 0.7,
 'bagging_freq': 30,
 'feature_fraction': 0.8,
 'lambda_l1': 0.1,
 'lambda_l2': 0.0,
 'max_bin': 255,
 'max_depth': 4,
 'min_data_in_leaf': 81,
 'min_split_gain': 0.1,
 'num_leaves': 10}

调参方向：处理过拟合（过拟合和准确率往往相反）

使用较小的 max_bin

使用较小的 num_leaves

使用 min_data_in_leaf 和 min_sum_hessian_in_leaf

通过设置 bagging_fraction 和 bagging_freq 来使用 bagging

通过设置 feature_fraction <1来使用特征抽样

使用更大的训练数据

使用 lambda_l1, lambda_l2 和 min_gain_to_split 来使用正则

尝试 max_depth 来避免生成过深的树

链接：https://www.jianshu.com/p/1100e333fcab


### LightGBM的例子

In [8]:
other_params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

In [9]:
model = lgb.LGBMRegressor(**other_params)  # lgb.LGBMClassifier

In [10]:
cv_params = {'n_estimators': [400, 500, 600, 700, 800]}
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.grid_scores_
print('每轮迭代运行结果:{0}'.format(evalute_result))
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:   14.2s finished


每轮迭代运行结果:[mean: 0.17986, std: 0.01637, params: {'n_estimators': 400}, mean: 0.17260, std: 0.01645, params: {'n_estimators': 500}, mean: 0.16424, std: 0.01798, params: {'n_estimators': 600}, mean: 0.15851, std: 0.01998, params: {'n_estimators': 700}, mean: 0.15323, std: 0.01958, params: {'n_estimators': 800}]
参数的最佳取值：{'n_estimators': 400}
最佳模型得分:0.17986297343575536


### xgboost的例子

In [11]:
# model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603,gamma=0.0468,
#                             learning_rate = 0.05,max_depth =3,
#                             min_child_weight = 1.7817,n_estimators = 2200,
#                             reg_alpha=0.4640,reg_lambda=0.8571,
#                             subsample=0.5213,silent=1,nthread= -1)

cv_params = {'n_estimators': [400, 500, 600, 700, 800]}
other_params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.grid_scores_
print('每轮迭代运行结果:{0}'.format(evalute_result))
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

NameError: name 'xgb' is not defined