## <center>欢迎来到LightGBM的世界</center>

### Mac下的编译安装


#### 命令如下：

brew install cmake

brew install gcc --without-multilib


git clone --recursive https://github.com/Microsoft/LightGBM

cd LightGBM

export CXX=g++-7 CC=gcc-7

mkdir build

cd build

cmake ..

make -j4

pip3 install lightgbm


#### 温暖提示：

cd python-package

sudo python setup.py install 

会依然提示CMake没有安装的问题，至今不知道是为什么？

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score

### 读入原始数据，并进行数据集划分(数据集来源于官方网站)

In [3]:
train= pd.read_csv('multiclass.train', header=None, sep = '\t')
test= pd.read_csv('multiclass.test', header=None, sep = '\t')
num_train = train.shape[0]
kfolds = 0.9


X_train = train.ix[:int(kfolds * num_train),1:]
y_train = train.ix[:int(kfolds * num_train),0]

X_val = train.ix[int(kfolds * num_train):,1:]
y_val = train.ix[int(kfolds * num_train):,0]

X_test = test.ix[:,1:]
y_test = test.ix[:,0]

lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_val, y_val)

num_class = y_train.unique().max()+1


### 设置初始参数(交叉验证的参数可以不设置)

In [4]:
params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'metric': {'multi_error'},
            'num_class': num_class,
            'num_leaves': 80,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 4,
            'verbose': 0,
            #'device':'gpu'
        }

### 交叉验证过程

In [29]:
min_merror = float('Inf')
best_params = {}
for learning_rate in [0.1]:
    for num_boost_round in range(30, 31):
        for max_depth in range(6, 8):
            params['learning_rate'] = learning_rate
            params['max_depth'] = max_depth
            cv_results = lgb.cv(
                            params,
                            lgb_train,
                            num_boost_round=num_boost_round,
                            seed=42,
                            nfold=3,
                            metrics=['multi_error'],
                            early_stopping_rounds=3
                          )
            mean_merror = pd.Series(cv_results['multi_error-mean']).min()
            boost_rounds = pd.Series(cv_results['multi_error-mean']).argmin()
            if mean_merror < min_merror:
                min_merror = mean_merror
                best_params['learning_rate'] = learning_rate
                best_params['num_boost_round'] = boost_rounds
                best_params['max_depth'] = max_depth

# setting best params
params['learning_rate'] = best_params['learning_rate']
params['max_depth'] = best_params['max_depth']
num_round = best_params['num_boost_round']

# using lgb_test as valid_sets
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=num_round,
                )

### 数据预测

In [19]:
y_pred = gbm.predict(X_test)
acc = accuracy_score(y_test, np.argmax(y_pred, axis=1))
print(acc)

0.378


### 模型保存

In [20]:
gbm.save_model('model.001')

### 天池-蚂蚁金服：商场中精确定位用户所在店铺(部分关键代码)

In [None]:
        sz = train.shape
        kfolds = 0.8
        train_X = train[:int(sz[0] * kfolds), :]
        train_Y = label[:int(sz[0] * kfolds)]
        
        val_X = train[int(sz[0] * kfolds) : int(sz[0] * (kfolds + 0.1)), :]
        val_Y = label[int(sz[0] * kfolds) : int( sz[0] * (kfolds + 0.1) )]
        
        test_X = train[int(sz[0] * (kfolds + 0.1 )):, :]
        test_Y = label[int( sz[0] * (kfolds + 0.1) ):]

        ltrain = lgb.Dataset(train_X, label=train_Y)
        lval = lgb.Dataset(val_X, label=val_Y)

        num_class = int( max(label) ) + 1

        params = {  'task': 'train',
                    'boosting_type': 'gbdt',
                    'objective': 'multiclass',
                    'metric': {'multi_error'},
                    'num_class': num_class,
                    'num_leaves': 64,
                    'feature_fraction': 0.9,
                    'bagging_fraction': 0.8,
                    'bagging_freq': 4,
                    'verbose':0,
                    #'device': 'gpu'
            }
        num_boost_round = 30
        model = lgb.train(  params,
                            ltrain,
                            num_boost_round,
                            valid_sets=lval,
                            early_stopping_rounds=5
                            )
        y_pred = model.predict(test_X, num_iteration=model.best_iteration)

        acc = accuracy_score(test_Y, np.argmax(y_pred, axis=1))
        print('Val acc is {}'.format(acc))
        with open('test.csv', 'a') as f:
            f.write(str(mall_file.split('.')[0])+','+str(acc)+'\n')
        model_path = model_dir+str( mall_file.split('.')[0] )+'.model'
        model.save_model(model_path, num_iteration=model.best_iteration)    

思考：蚂蚁金服的代码中是在设定一组参数之后，根据验证集选择**num_boost_round**这个参数。早停止策略是独立于选参过程的，如果在train中设置了早停止策略，**注意模型并非在最佳num_boost_round处停止**，所以，在模型推断(inference)和模型持久化(persistence)时，要指定模型的最佳迭代次数**model.best_iteration**。

和上述的交叉验证过程进行对比: 在训练数据较少的前提下，可以使用交叉验证充分利用已有数据进行参数选择。而上述过程其实是固定验证集的参数选择过程。一般来说，在给定模型超参数的前提下，可以通过比较最佳训练轮次对应的loss进行参数选择。在早停止策略下，比较最佳训练轮次附近的loss也是可以的。个人观点，当选择完超参数之后，需要在验证集上重新选择**num_boost_round**参数。