In [1]:
# coding: utf-8
# pylint: disable = invalid-name, C0111
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

try:
    import cPickle as pickle
except BaseException:
    import pickle

In [2]:
print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('./binary_data/binary.train', header=None, sep='\t')
df_test = pd.read_csv('./binary_data/binary.test', header=None, sep='\t')
W_train = pd.read_csv('./binary_data/binary.train.weight', header=None)[0]
W_test = pd.read_csv('./binary_data/binary.test.weight', header=None)[0]

Loading data...


In [3]:
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

num_train, num_feature = X_train.shape
num_train, num_feature

(7000, 28)

### weight生命每一列的权重，free_raw_data 来重用数据

In [4]:
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False    重用数据
lgb_train = lgb.Dataset(X_train, y_train,
                        weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                       weight=W_test, free_raw_data=False)

In [5]:
# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

### 验证数据是训练数据，如果没有不显示训练过程

In [6]:
# generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)]

print('Starting training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # eval training data
                feature_name=feature_name,
                categorical_feature=[21])
print('Finished first 10 rounds...')

Starting training...
[1]	training's binary_logloss: 0.680298
[2]	training's binary_logloss: 0.672021
[3]	training's binary_logloss: 0.664444
[4]	training's binary_logloss: 0.655536
[5]	training's binary_logloss: 0.647375
[6]	training's binary_logloss: 0.640788
[7]	training's binary_logloss: 0.635012
[8]	training's binary_logloss: 0.628454
[9]	training's binary_logloss: 0.622423
[10]	training's binary_logloss: 0.616808
Finished first 10 rounds...


New categorical_feature is [21]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [7]:
# check feature name
print('7th feature name is:', lgb_train.feature_name[6])

# feature names
print('Feature names:', gbm.feature_name())

# feature importances
print('Feature importances:', list(gbm.feature_importance()))

7th feature name is: feature_6
Feature names: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27']
Feature importances: [8, 4, 0, 19, 8, 36, 3, 0, 2, 10, 5, 1, 0, 9, 5, 3, 0, 2, 2, 5, 1, 0, 35, 3, 28, 45, 31, 35]


### 保存成.txt、JSON格式

In [8]:
print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Dumping model to JSON...')
# dump model to JSON (and save to file)
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)



Saving model...
Dumping model to JSON...


In [9]:
print('Loading model to predict...')
# load model to predict
bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
print("The rmse of loaded model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)

Loading model to predict...
The rmse of loaded model's prediction is: 0.4618189809505519


### 模型保存成.pkl和加载.pkl模型

In [10]:
print('Dumping and loading model with pickle...')
# dump model with pickle
with open('model.pkl', 'wb') as fout:
    pickle.dump(gbm, fout)
# load model with pickle to predict
with open('model.pkl', 'rb') as fin:
    pkl_bst = pickle.load(fin)
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
# eval with loaded model
print("The rmse of pickled model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)

Dumping and loading model with pickle...
The rmse of pickled model's prediction is: 0.46989528982016704


In [11]:
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',
                valid_sets=lgb_eval)

print('Finished 10 - 20 rounds with model file...')

[11]	valid_0's binary_logloss: 0.613941
[12]	valid_0's binary_logloss: 0.610317
[13]	valid_0's binary_logloss: 0.606257
[14]	valid_0's binary_logloss: 0.601789
[15]	valid_0's binary_logloss: 0.597803
[16]	valid_0's binary_logloss: 0.594579
[17]	valid_0's binary_logloss: 0.590794
[18]	valid_0's binary_logloss: 0.58741
[19]	valid_0's binary_logloss: 0.584296
[20]	valid_0's binary_logloss: 0.581739
Finished 10 - 20 rounds with model file...


New categorical_feature is [21]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


### 设置init_model 可以接着上面继续训练

In [12]:
# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)

print('Finished 20 - 30 rounds with decay learning rates...')

[21]	valid_0's binary_logloss: 0.613941
[22]	valid_0's binary_logloss: 0.610352
[23]	valid_0's binary_logloss: 0.60637
[24]	valid_0's binary_logloss: 0.602024
[25]	valid_0's binary_logloss: 0.598221
[26]	valid_0's binary_logloss: 0.595039
[27]	valid_0's binary_logloss: 0.591429
[28]	valid_0's binary_logloss: 0.588352
[29]	valid_0's binary_logloss: 0.585486
[30]	valid_0's binary_logloss: 0.582613
Finished 20 - 30 rounds with decay learning rates...




### 通过callbacks在训练过程中更改参数

In [13]:
# change other parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finished 30 - 40 rounds with changing bagging_fraction...')



[31]	valid_0's binary_logloss: 0.613621
[32]	valid_0's binary_logloss: 0.609283
[33]	valid_0's binary_logloss: 0.605998
[34]	valid_0's binary_logloss: 0.601608
[35]	valid_0's binary_logloss: 0.597791
[36]	valid_0's binary_logloss: 0.594658
[37]	valid_0's binary_logloss: 0.59106
[38]	valid_0's binary_logloss: 0.588155
[39]	valid_0's binary_logloss: 0.585972
[40]	valid_0's binary_logloss: 0.584073
Finished 30 - 40 rounds with changing bagging_fraction...


### 自定义训练过程中的目标函数 和 验证函数

In [14]:
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess


# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
# binary error
def binary_error(preds, train_data):
    labels = train_data.get_label()
    return 'error', np.mean(labels != (preds > 0.5)), False


gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=binary_error,
                valid_sets=lgb_eval)

print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')



[41]	valid_0's binary_logloss: 4.6809	valid_0's error: 0.408
[42]	valid_0's binary_logloss: 4.49648	valid_0's error: 0.404
[43]	valid_0's binary_logloss: 4.54169	valid_0's error: 0.396
[44]	valid_0's binary_logloss: 4.76092	valid_0's error: 0.38
[45]	valid_0's binary_logloss: 4.7524	valid_0's error: 0.384
[46]	valid_0's binary_logloss: 4.74017	valid_0's error: 0.376
[47]	valid_0's binary_logloss: 4.72303	valid_0's error: 0.372
[48]	valid_0's binary_logloss: 4.71548	valid_0's error: 0.366
[49]	valid_0's binary_logloss: 4.78898	valid_0's error: 0.368
[50]	valid_0's binary_logloss: 5.00385	valid_0's error: 0.358
Finished 40 - 50 rounds with self-defined objective function and eval metric...


### 设置多个验证函数

In [15]:
# another self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
# accuracy
def accuracy(preds, train_data):
    labels = train_data.get_label()
    return 'accuracy', np.mean(labels == (preds > 0.5)), True


gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=lambda preds, train_data: [binary_error(preds, train_data),
                                                 accuracy(preds, train_data)],
                valid_sets=lgb_eval)

print('Finished 50 - 60 rounds with self-defined objective function '
      'and multiple self-defined eval metrics...')

[51]	valid_0's binary_logloss: 4.6809	valid_0's error: 0.408	valid_0's accuracy: 0.592
[52]	valid_0's binary_logloss: 4.49648	valid_0's error: 0.404	valid_0's accuracy: 0.596
[53]	valid_0's binary_logloss: 4.54169	valid_0's error: 0.396	valid_0's accuracy: 0.604
[54]	valid_0's binary_logloss: 4.76092	valid_0's error: 0.38	valid_0's accuracy: 0.62
[55]	valid_0's binary_logloss: 4.7524	valid_0's error: 0.384	valid_0's accuracy: 0.616
[56]	valid_0's binary_logloss: 4.74017	valid_0's error: 0.376	valid_0's accuracy: 0.624
[57]	valid_0's binary_logloss: 4.72303	valid_0's error: 0.372	valid_0's accuracy: 0.628
[58]	valid_0's binary_logloss: 4.71548	valid_0's error: 0.366	valid_0's accuracy: 0.634
[59]	valid_0's binary_logloss: 4.78898	valid_0's error: 0.368	valid_0's accuracy: 0.632
[60]	valid_0's binary_logloss: 5.00385	valid_0's error: 0.358	valid_0's accuracy: 0.642
Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...




### 通过callbacks在训练一定次数加入新的验证集

In [16]:
print('Starting a new training job...')


# callback
def reset_metrics():
    def callback(env):
        lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(lgb_eval_new, 'new_valid')
    callback.before_iteration = True
    callback.order = 0
    return callback


gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,
                callbacks=[reset_metrics()])

print('Finished first 10 rounds with callback function...')

Starting a new training job...
[1]	training's binary_logloss: 0.611255
[2]	training's binary_logloss: 0.606714
[3]	training's binary_logloss: 0.602329
[4]	training's binary_logloss: 0.597525
[5]	training's binary_logloss: 0.592888
Add a new valid dataset at iteration 5...
[6]	training's binary_logloss: 0.588917	new_valid's binary_logloss: 0.659461
[7]	training's binary_logloss: 0.585157	new_valid's binary_logloss: 0.654497
[8]	training's binary_logloss: 0.581203	new_valid's binary_logloss: 0.649638
[9]	training's binary_logloss: 0.577482	new_valid's binary_logloss: 0.644992
[10]	training's binary_logloss: 0.574196	new_valid's binary_logloss: 0.6413
Finished first 10 rounds with callback function...


