In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer

In [2]:
class Node:
    """One node of a binary decision tree.

    A freshly created node has depth 1, value 0 and no children, split
    feature, threshold or terminal flag; the tree builder fills these in.
    """

    def __init__(self):
        # Child links and the split description start out unset.
        self.Left = self.Right = None
        self.Feature = self.Threshold = None
        # None (falsy) until the builder marks the node as a leaf.
        self.Is_terminal = None
        # Depth counts from 1 at the root; Value is the node's prediction.
        self.Depth = 1
        self.Value = 0

In [3]:
class RegTree( ):
    """Regression tree fitted to residuals, used as a boosting weak learner.

    Splits are chosen to minimise the summed squared error of the residuals
    on both sides of the split.  Node values are not residual means: they
    are computed by ``C_mj`` as ``sum(residual) / sum(p * (1 - p))`` with
    ``p = y - residual``.

    Parameters
    ----------
    max_depth : int
        A node whose ``Depth`` exceeds this value is not split.
    min_sample_leaf : int
        A split is rejected when either side has fewer rows than this.
    min_sample_split : int
        A node holding this many rows *or fewer* is not split.
    n_thresholds : int
        Number of equal-width steps the range of each feature is divided
        into when generating candidate thresholds (default 10).
    """

    def __init__(self,max_depth = 10, min_sample_leaf = 1,min_sample_split = 2,
                 n_thresholds = 10):
        if n_thresholds < 1:
            raise ValueError("n_thresholds must be at least 1")
        self.max_depth = max_depth
        self.min_sample_leaf = min_sample_leaf
        self.min_sample_split = min_sample_split
        self.n_thresholds = n_thresholds
        self.Tree = None

    def Avg(self,y):
        """Return the arithmetic mean of the 1-D array ``y``."""
        return np.sum(y) / y.shape[ 0 ]

    def MSE( self,y,c ):
        """Return the *sum* (not the mean) of squared deviations of ``y`` from ``c``."""
        return np.sum((y - c) ** 2)

    def C_mj(self,y,residual):
        """Return the value assigned to a node holding ``y`` / ``residual``.

        Computes ``sum(residual) / sum(p * (1 - p))`` where ``p = y - residual``.
        Returns 0.0 when the denominator is numerically zero instead of
        dividing by zero.
        """
        nominator = np.sum( residual )
        denominator = np.sum(np.multiply(y - residual, 1 - y + residual))
        # Saturated probabilities (p == 0 or p == 1) make the denominator vanish.
        if abs(denominator) < 1e-150:
            return 0.0
        return nominator / denominator

    def BestSplitFeature(self,X,y,residual):
        """Search every feature for the threshold with the lowest split loss.

        For each column, candidate thresholds are ``min + n * step`` for
        ``n = 1, 2, ...`` while the candidate stays below the column maximum,
        with ``step = (max - min) / n_thresholds``.  The loss of a candidate
        is the summed squared error of ``residual`` around its mean on each
        side.

        Returns
        -------
        tuple
            ``(feature, threshold, X_left, y_left, residual_left,
            X_right, y_right, residual_right)``.  Rows with
            ``X[:, feature] <= threshold`` go left, the rest go right.
            When no candidate exists (e.g. every feature is constant),
            ``feature`` and ``threshold`` are ``None``, the "left" part is
            the whole input and the "right" part is empty.
        """
        BestFeature = None
        Threshold = None
        MinMse = np.inf
        for index in range( X.shape[ 1 ]):
            fea_i = X[ :,index ]
            max_fea_i = np.max(fea_i)
            min_fea_i = np.min(fea_i)
            step = (max_fea_i - min_fea_i) / self.n_thresholds
            n = 1
            while min_fea_i + n * step < max_fea_i:
                threshold = min_fea_i + n * step
                # Advance before any `continue` so the loop always terminates.
                n += 1
                R1 = residual[ fea_i <= threshold ]
                R2 = residual[ fea_i > threshold ]

                if R1.shape[ 0 ]==0 or R2.shape[ 0 ]==0:
                    continue

                c1 = self.Avg( R1 )
                c2 = self.Avg( R2 )
                mse = self.MSE(R1,c1) + self.MSE(R2,c2)

                # Strict comparison: the first candidate wins ties.
                if MinMse > mse:
                    MinMse = mse
                    BestFeature = index
                    Threshold = threshold

        if BestFeature is None:
            # Nothing to split on; keep the 8-tuple shape for callers.
            return None,None,X,y,residual,X[ :0 ],y[ :0 ],residual[ :0 ]

        X_col = X[:,BestFeature]
        left = X_col <= Threshold
        right = X_col > Threshold
        return (BestFeature,Threshold,
                X[ left,: ],y[ left ],residual[ left ],
                X[ right,: ],y[ right ],residual[ right ])

    def CreateTree(self,X,y,node,residual):
        """Recursively grow the subtree rooted at ``node`` from the given rows.

        ``node`` is marked terminal (and left without children) when its
        depth exceeds ``max_depth``, when it holds ``min_sample_split`` rows
        or fewer, when no split candidate exists, or when the best split
        would leave a side with fewer than ``min_sample_leaf`` rows.
        Otherwise the split is stored on ``node`` and two children are
        created whose values come from ``C_mj``.
        """
        if node.Depth > self.max_depth or X.shape[ 0 ] <= self.min_sample_split:
            node.Is_terminal = True
            return

        splitCol,thresh,X_left,y_left,residual_left,X_right, y_right,residual_right = (
            self.BestSplitFeature(X,y,residual))
        if (splitCol is None
                or X_left.shape[ 0 ] < self.min_sample_leaf
                or X_right.shape[ 0 ] < self.min_sample_leaf):
            node.Is_terminal = True
            return

        node.Feature = splitCol
        node.Threshold = thresh

        node.Left = Node()
        node.Left.Depth = node.Depth + 1
        node.Left.Value = self.C_mj( y_left,residual_left )

        node.Right = Node()
        node.Right.Depth = node.Depth + 1
        node.Right.Value = self.C_mj( y_right,residual_right )

        self.CreateTree( X_left,y_left,node.Left,residual_left )
        self.CreateTree( X_right,y_right,node.Right,residual_right )

    def fit(self,X,y,residual):
        """Build the tree from features ``X``, targets ``y`` and ``residual``.

        The inputs are converted with ``np.asarray``.  The root keeps the
        value 0, so a tree whose root cannot be split predicts 0 everywhere.
        """
        X = np.asarray( X )
        y = np.asarray( y )
        residual = np.asarray( residual )
        self.Tree = Node()
        self.Tree.Value = 0
        self.CreateTree( X,y,self.Tree,residual )

    def predictSample(self,x,node):
        """Return the value of the leaf that sample ``x`` reaches from ``node``.

        At each internal node the sample goes right when
        ``x[node.Feature] > node.Threshold`` and left otherwise.
        """
        if node.Is_terminal:
            return node.Value

        if x[node.Feature] > node.Threshold:
            y_pred =  self.predictSample( x,node.Right )
        else:
            y_pred =  self.predictSample( x,node.Left )

        return y_pred

    def predict( self,X ):
        """Return an array with one leaf value per row of ``X``."""
        return np.asarray([ self.predictSample(x,self.Tree) for x in X ])

In [4]:
class GradientBoostTree( ):
    """Gradient-boosted trees for binary (0/1) classification.

    The raw score of a sample is
    ``first_item + learning_rate * sum(tree.predict(x) for tree in models)``
    and is mapped to a probability with the logistic sigmoid.  Each tree is
    a ``RegTree`` fitted to the residual ``y - probability``.

    Parameters
    ----------
    learning_rate : float
        Factor applied to the summed tree outputs.
    n_trees : int
        Number of additive terms.  The constant ``first_item`` counts as
        one of them, so ``n_trees - 1`` regression trees are fitted.
    max_depth, min_sample_leaf, min_sample_split : int
        Forwarded unchanged to every ``RegTree``.
    """

    def __init__(self,learning_rate = 0.01,n_trees = 10,max_depth = 10,min_sample_leaf = 1,
                 min_sample_split = 2):
        self.learning_rate = learning_rate
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_sample_leaf = min_sample_leaf
        self.min_sample_split = min_sample_split
        self.models = [ ]
        self.first_item = 0

    def Sigmoid(self,f):
        """Return the logistic sigmoid ``1 / (1 + exp(-f))`` element-wise."""
        return 1 / ( 1 + np.exp( -f ))

    def fit(self,X,y):
        """Fit the ensemble on features ``X`` and 0/1 labels ``y``.

        Any previously fitted trees are discarded.  The initial score is
        the log-odds ``log(#ones / #zeros)``; if one of the classes is
        absent the class prior is clipped away from 0 and 1 first so the
        log-odds stay finite.

        Raises
        ------
        ValueError
            If ``y`` contains no label equal to 0 or 1.
        """
        X = np.asarray( X )
        y = np.asarray( y )

        n_pos = int(np.count_nonzero( y == 1 ))
        n_neg = int(np.count_nonzero( y == 0 ))
        if n_pos + n_neg == 0:
            raise ValueError("y must contain at least one label equal to 0 or 1")
        if n_pos > 0 and n_neg > 0:
            y_f = np.log( n_pos / n_neg )
        else:
            # Single-class data: avoid log(0) and division by zero.
            eps = np.finfo( np.float32 ).eps
            prior = np.clip( n_pos / (n_pos + n_neg), eps, 1 - eps )
            y_f = np.log( prior / (1 - prior) )

        self.first_item =  y_f
        # Refitting must start from scratch rather than stack on old trees.
        self.models = [ ]

        # Running sum of tree outputs on the training rows, so earlier trees
        # are not re-evaluated on every boosting round.
        tree_sum = np.zeros( X.shape[ 0 ] )
        y_prob = self.Sigmoid( y_f )
        for _ in range( self.n_trees - 1 ):
            residual = y - y_prob
            tree = RegTree( max_depth = self.max_depth,
                           min_sample_leaf = self.min_sample_leaf,
                           min_sample_split = self.min_sample_split )
            tree.fit(X,y,residual)
            self.models.append( tree )
            tree_sum = tree_sum + tree.predict( X )
            y_prob = self.Sigmoid( self.first_item + self.learning_rate * tree_sum )

    def model_sum(self,X):
        """Return the raw score of every row of ``X`` as a 1-D array.

        With no fitted trees the result is ``first_item`` repeated once per
        row, so the output always has shape ``(n_rows,)``.
        """
        X = np.asarray( X )
        y_sum = np.zeros( X.shape[ 0 ] )
        for model in self.models:
            y_sum = y_sum + model.predict( X )
        return self.first_item + self.learning_rate * y_sum

    def predict(self,X):
        """Return the predicted label (0 or 1) for every row of ``X``.

        A row is labelled 1 when its predicted probability is at least 0.5.
        """
        y_sum = self.model_sum( X )
        y_prob = self.Sigmoid( y_sum )
        return np.where(y_prob < 0.5,0,1)
    
if __name__ == '__main__':
    # Demo: fit the hand-written booster on the breast-cancer data and
    # print its accuracy on a held-out 20% split.
    datas = load_breast_cancer()
    X, y = datas.data, datas.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=34
    )

    model = GradientBoostTree(learning_rate=0.01, n_trees=100, max_depth=3)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print('梯度提升树的准确率是:', accuracy_score(y_test, y_pred))

梯度提升树的准确率是: 0.956140350877193


In [5]:
# Rebuild the train/test split at notebook scope for the cells below
# (same dataset, test fraction and seed as the demo above).
datas = load_breast_cancer()
X, y = datas.data, datas.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=34
)

In [7]:
from sklearn.ensemble import GradientBoostingClassifier

In [8]:
from xgboost import XGBClassifier

In [9]:
# scikit-learn reference model. The seed is fixed so that re-running the
# notebook reproduces the same fitted trees and score.
model = GradientBoostingClassifier(random_state=34)

In [10]:
# Train the reference model and score it on the held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)
print('梯度提升树的准确率是:', accuracy_score(y_test, y_pred))

梯度提升树的准确率是: 0.956140350877193
