#### gbdt的apply()的作用：

apply() 返回每个样本在每棵树中落入的叶子节点编号；对这些编号做 one-hot 编码，就把原始特征转换到了一个更高维的空间，形成稀疏矩阵，然后再用线性模型进行概率预估。GBDT 后面接的模型不同，需要做的特征工程也不同：如果是 LR，就对叶子节点编号做 one-hot 编码；如果是 LibFFM，就直接使用叶子节点的 index。最后把 GBDT 输出的向量作为高级特征加入到特征向量中。

#### LogisticRegressionWithLBFGS——逻辑回归

L-BFGS（Limited-memory BFGS）是对拟牛顿法 BFGS 的改进：它不保存完整的近似 Hessian 逆矩阵，而是只保留最近若干次迭代的信息来近似它，从而大幅降低内存开销。

In [16]:
from sklearn.ensemble import GradientBoostingClassifier as SGBClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

# datasets
from sklearn.datasets import load_breast_cancer

In [54]:
class GBDTClassifier(object):
    """Gradient-boosted tree classifier that doubles as a feature encoder.

    Wraps either sklearn's ``GradientBoostingClassifier`` or xgboost's
    ``XGBClassifier``. Once fitted, every sample can be mapped to the leaf it
    lands in for each tree; those leaf indices are one-hot encoded so they
    can be fed to a downstream linear model.
    """

    def __init__(self, name='xgboost', **parameters):
        """Build the (not yet fitted) underlying model.

        parameters:
            name: 'sklearn' or 'xgboost'
            estimators: number of subtrees
            max_depth: max depth of a subtree
            learning_rate: learning rate
            max_leaf_nodes: max leaf nodes of a subtree
        raises:
            ValueError: if ``name`` is not one of the supported back-ends
            KeyError: if any of the four hyper-parameters above is missing
        """
        # Validate explicitly: an ``assert`` would be stripped under ``-O``
        # and leave the instance without any model at all.
        if name not in ('sklearn', 'xgboost'):
            raise ValueError(
                "name must be 'sklearn' or 'xgboost', got %r" % (name,))

        self.__model_name = name                  # which back-end is wrapped
        self.__parameters = parameters            # raw GBDT hyper-parameters
        self.__feature_encoder = OneHotEncoder()  # encodes leaf indices
        self.__classifier = None                  # set by fit()

        # Instantiate the back-end model.
        if name == 'sklearn':
            self.__make_sklearn_model()
        else:
            self.__make_xgboost_model()

    def __require_fitted(self):
        """Return the fitted classifier, or raise RuntimeError before fit()."""
        if self.__classifier is None:
            raise RuntimeError(
                'GBDTClassifier is not fitted yet; call fit() first')
        return self.__classifier

    def __hyper_parameters(self):
        """Return (estimators, learning_rate, max_depth, max_leaf_nodes).

        Raises KeyError when one of them was not passed to the constructor.
        """
        params = self.__parameters
        return (params['estimators'], params['learning_rate'],
                params['max_depth'], params['max_leaf_nodes'])

    def __apply(self, data):
        """Map every sample to the index of the leaf it reaches in each tree."""
        leaf_indices = self.__require_fitted().apply(data)
        if self.__model_name == 'sklearn':
            # sklearn returns a 3-D array here (the original notebook output
            # shows one column per tree and sample); keep only slot 0 of the
            # last axis to get one row of leaf indices per sample.
            # NOTE(review): for multi-class targets this keeps only the first
            # class's trees -- confirm that is intended.
            leaf_indices = leaf_indices[:, :, 0]
        return leaf_indices

    def __transform_onehot_feature(self, data):
        """One-hot encode the leaf indices of ``data`` as a dense array."""
        leaf_indices = self.__apply(data)
        return self.__feature_encoder.transform(leaf_indices).toarray()

    def fit(self, samples, labels, split_rate=0.8):
        """Fit the GBDT model and the one-hot encoder.

        The first ``int(n_samples * split_rate)`` rows are used for training
        and the remaining rows for a held-out AUC report (no shuffling is
        done here). The encoder is fitted on the leaf indices of *all* rows.

        parameters:
            samples: shape is [n_samples, n_features]
            labels: shape is [n_samples, ]
            split_rate: fraction of leading rows used for training
        returns:
            one-hot encoded leaf features of the whole input, dense array
        raises:
            ValueError: if samples and labels disagree on the row count
        """
        if samples.shape[0] != labels.shape[0]:
            raise ValueError(
                'samples and labels must have the same number of rows, '
                'got %d and %d' % (samples.shape[0], labels.shape[0]))

        train_count = int(samples.shape[0] * split_rate)
        train_samples, test_samples = samples[:train_count], samples[train_count:]
        train_labels, test_labels = labels[:train_count], labels[train_count:]

        # Train the tree model on the leading slice.
        self.__classifier = self.__model.fit(train_samples, train_labels)

        # Report AUC only when there is a held-out slice to score; with
        # split_rate=1.0 the slice is empty and scoring it would fail.
        if test_samples.shape[0] > 0:
            test_prob = self.__classifier.predict_proba(test_samples)[:, 1]
            auc = roc_auc_score(test_labels, test_prob)
            print('gbdt的%s模型, auc = %.5f' % (self.__model_name, auc))

        # Fit the encoder and encode in one pass so the trees are applied to
        # the data only once.
        leaf_indices = self.__apply(samples)
        return self.__feature_encoder.fit_transform(leaf_indices).toarray()

    def predict_trees(self, data):
        """Use the fitted GBDT as a feature transformer.

        parameters:
            data: shape [n_samples, n_features]
        return:
            shape [n_samples, n_transformed_features]
        raises:
            RuntimeError: if called before fit()
        """
        return self.__transform_onehot_feature(data)

    def predict(self, data):
        """Predict class labels with the fitted GBDT model.

        raises:
            RuntimeError: if called before fit()
        """
        return self.__require_fitted().predict(data)

    def __make_sklearn_model(self):
        """Instantiate sklearn's GradientBoostingClassifier."""
        estimators, lrate, max_depth, max_leaf_nodes = self.__hyper_parameters()
        self.__model = SGBClassifier(n_estimators=estimators,
                                     learning_rate=lrate,
                                     max_depth=max_depth,
                                     max_leaf_nodes=max_leaf_nodes,
                                     random_state=0)

    def __make_xgboost_model(self):
        """Instantiate xgboost's XGBClassifier."""
        estimators, lrate, max_depth, max_leaf_nodes = self.__hyper_parameters()
        # NOTE(review): xgboost documents its leaf limit as ``max_leaves``;
        # confirm ``max_leaf_nodes`` is actually honoured rather than ignored.
        self.__model = XGBClassifier(nthread=4,
                                     learning_rate=lrate,
                                     n_estimators=estimators,
                                     max_depth=max_depth,
                                     gamma=0,
                                     subsample=0.9,
                                     max_leaf_nodes=max_leaf_nodes,
                                     colsample_bytree=0.5
                                     )

In [69]:
class GBDTLRPipeline(object):
    """GBDT + LR pipeline.

    A tree model turns each sample into one-hot encoded leaf features, and a
    logistic regression is trained on those features.
    """

    def __init__(self, gb_classifier):
        """Store the tree model; it must provide ``fit`` and ``predict_trees``."""
        self.__gb_classifier = gb_classifier
        self.__lr_classifier = None  # set by fit()

    def fit(self, samples, labels, split_rate=0.8):
        """Fit the tree model, then fit LR on its encoded output.

        parameters:
            samples: shape is [n_samples, n_features]
            labels: shape is [n_samples, ]
            split_rate: fraction of leading rows used for training
        """
        # High-dimensional encoding produced by the tree model.
        tree_encoding_samples = self.__gb_classifier.fit(samples, labels, split_rate)
        print('tree_encoding_samples: ', tree_encoding_samples)

        # Train the LR model on the encoded vectors.
        self.__lr_train(tree_encoding_samples, labels, split_rate)

    def __lr_train(self, samples, labels, split_rate):
        """Train the LR model on the leading slice and report held-out AUC.

        raises:
            ValueError: if samples and labels disagree on the row count
        """
        # Explicit check: an ``assert`` would be stripped under ``-O``.
        if samples.shape[0] != labels.shape[0]:
            raise ValueError(
                'samples and labels must have the same number of rows, '
                'got %d and %d' % (samples.shape[0], labels.shape[0]))

        train_count = int(samples.shape[0] * split_rate)
        train_samples, test_samples = samples[:train_count], samples[train_count:]
        train_labels, test_labels = labels[:train_count], labels[train_count:]

        lr_model = LogisticRegression(random_state=2019, solver='lbfgs')
        self.__lr_classifier = lr_model.fit(train_samples, train_labels)

        # Report AUC only when there is a held-out slice to score; with
        # split_rate=1.0 the slice is empty and scoring it would fail.
        if test_samples.shape[0] > 0:
            test_prob = self.__lr_classifier.predict_proba(test_samples)[:, 1]
            auc = roc_auc_score(test_labels, test_prob)
            print("lr模型，auc是%.5f" % (auc))

    def __require_fitted(self):
        """Return the fitted LR model, or raise RuntimeError before fit()."""
        if self.__lr_classifier is None:
            raise RuntimeError(
                'GBDTLRPipeline is not fitted yet; call fit() first')
        return self.__lr_classifier

    def predict(self, data):
        """Predict class labels for ``data``.

        raises:
            RuntimeError: if called before fit()
        """
        # The original read ``self.__lf_classifier`` (typo), which does not
        # exist and raised AttributeError on every call.
        lr_classifier = self.__require_fitted()
        return lr_classifier.predict(self.__gb_classifier.predict_trees(data))

    def predict_proba(self, data):
        """Return, as a list, column 1 of the LR ``predict_proba`` output.

        raises:
            RuntimeError: if called before fit()
        """
        lr_classifier = self.__require_fitted()
        prob = lr_classifier.predict_proba(self.__gb_classifier.predict_trees(data))
        return [p[1] for p in prob]

In [75]:
if __name__ == '__main__':
    # Breast-cancer dataset: X holds the features, Y the labels.
    X, Y = load_breast_cancer(return_X_y=True)

    print('X.shape, Y.shape: ', X.shape, Y.shape)

    # GBDT hyper-parameters.
    ESTIMATORS = 6        # number of boosting rounds (trees)
    MAX_DEPTH = 2         # max depth of each tree
    LEARNING_RATE = 0.08  # learning rate
    MAX_LEAF_NODES = 3    # max leaf nodes per tree

    # Tree model that yields both a classifier and the one-hot leaf encoding.
    sklearn_gbdt_model = GBDTClassifier('sklearn',
                                        estimators=ESTIMATORS,
                                        max_depth=MAX_DEPTH,
                                        learning_rate=LEARNING_RATE,
                                        max_leaf_nodes=MAX_LEAF_NODES)

    gbdt_lr_pipeline_model = GBDTLRPipeline(sklearn_gbdt_model)

    # Leading 80% of the rows train the models; the rest is used for the AUC
    # reports printed during fit.
    gbdt_lr_pipeline_model.fit(X, Y, 0.8)

    # NOTE: scored on the full dataset, including the rows used for training.
    p = gbdt_lr_pipeline_model.predict_proba(X)

    # Materialise the pairs: a bare zip() is a one-shot iterator, so a later
    # cell could only walk it once (re-running that cell would print nothing).
    predictd = list(zip(p, Y))

X.shape, Y.shape:  (569, 30) (569,)
gbdt的sklearn模型, auc = 0.98383
tree_encoding_samples:  [[1. 0. 0. ... 1. 0. 0.]
 [1. 0. 0. ... 1. 0. 0.]
 [1. 0. 0. ... 1. 0. 0.]
 ...
 [1. 0. 0. ... 1. 0. 0.]
 [1. 0. 0. ... 1. 0. 0.]
 [0. 1. 0. ... 0. 1. 0.]]
lr模型，auc是0.98558


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [74]:
# Print every (predicted probability, true label) pair, one per line.
for prob_label_pair in predictd:
    print(prob_label_pair)

(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.012363827598055335, 0)
(0.01061838752229847, 0)
(0.012363827598055335, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.06578465342904098, 0)
(0.012363827598055335, 0)
(0.3292246333537733, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.06578465342904098, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.9810255599106332, 1)
(0.9810255599106332, 1)
(0.9810255599106332, 1)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.01061838752229847, 0)
(0.4946174868987164, 0)
(0.9810255599106332, 1)
(0.9810255599106332, 0)
(0.06578465342904098, 0)
(0.9